In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
In [2]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Load Data¶

In [4]:
df_train = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/trainv1.csv'))
df_test = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/testv1.csv'))
df_train
<ipython-input-4-cf622a5b4f5d>:1: DtypeWarning: Columns (26) have mixed types. Specify dtype option on import or set low_memory=False.
  df_train = pd.DataFrame(pd.read_csv('/content/drive/MyDrive/DL_Project/trainv1.csv'))
Out[4]:
ID Customer_ID Month Name Age SSN Occupation Annual_Income Monthly_Inhand_Salary Num_Bank_Accounts ... Credit_Mix Outstanding_Debt Credit_Utilization_Ratio Credit_History_Age Payment_of_Min_Amount Total_EMI_per_month Amount_invested_monthly Payment_Behaviour Monthly_Balance Credit_Score
0 0x1602 CUS_0xd40 January Aaron Maashoh 23 821-00-0265 Scientist 19114.12 1824.843333 3 ... _ 809.98 26.822620 22 Years and 1 Months No 49.574949 80.41529543900253 High_spent_Small_value_payments 312.49408867943663 Good
1 0x1603 CUS_0xd40 February Aaron Maashoh 23 821-00-0265 Scientist 19114.12 NaN 3 ... Good 809.98 31.944960 NaN No 49.574949 118.28022162236736 Low_spent_Large_value_payments 284.62916249607184 Good
2 0x1604 CUS_0xd40 March Aaron Maashoh -500 821-00-0265 Scientist 19114.12 NaN 3 ... Good 809.98 28.609352 22 Years and 3 Months No 49.574949 81.699521264648 Low_spent_Medium_value_payments 331.2098628537912 Good
3 0x1605 CUS_0xd40 April Aaron Maashoh 23 821-00-0265 Scientist 19114.12 NaN 3 ... Good 809.98 31.377862 22 Years and 4 Months No 49.574949 199.4580743910713 Low_spent_Small_value_payments 223.45130972736786 Good
4 0x1606 CUS_0xd40 May Aaron Maashoh 23 821-00-0265 Scientist 19114.12 1824.843333 3 ... Good 809.98 24.797347 22 Years and 5 Months No 49.574949 41.420153086217326 High_spent_Medium_value_payments 341.48923103222177 Good
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99995 0x25fe9 CUS_0x942c April Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... _ 502.38 34.663572 31 Years and 6 Months No 35.104023 60.97133255718485 High_spent_Large_value_payments 479.866228 Poor
99996 0x25fea CUS_0x942c May Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... _ 502.38 40.565631 31 Years and 7 Months No 35.104023 54.18595028760385 High_spent_Medium_value_payments 496.65161 Poor
99997 0x25feb CUS_0x942c June Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... Good 502.38 41.255522 31 Years and 8 Months No 35.104023 24.02847744864441 High_spent_Large_value_payments 516.809083 Poor
99998 0x25fec CUS_0x942c July Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... Good 502.38 33.638208 31 Years and 9 Months No 35.104023 251.67258219721603 Low_spent_Large_value_payments 319.164979 Standard
99999 0x25fed CUS_0x942c August Nicks 25 078-73-5990 Mechanic 39628.99_ 3359.415833 4 ... Good 502.38 34.192463 31 Years and 10 Months No 35.104023 167.1638651610451 !@9#%8 393.673696 Poor

100000 rows × 28 columns

In [5]:
df_test
Out[5]:
ID Customer_ID Month Name Age SSN Occupation Annual_Income Monthly_Inhand_Salary Num_Bank_Accounts ... Num_Credit_Inquiries Credit_Mix Outstanding_Debt Credit_Utilization_Ratio Credit_History_Age Payment_of_Min_Amount Total_EMI_per_month Amount_invested_monthly Payment_Behaviour Monthly_Balance
0 0x160a CUS_0xd40 September Aaron Maashoh 23 821-00-0265 Scientist 19114.12 1824.843333 3 ... 2022.0 Good 809.98 35.030402 22 Years and 9 Months No 49.574949 236.64268203272135 Low_spent_Small_value_payments 186.26670208571772
1 0x160b CUS_0xd40 October Aaron Maashoh 24 821-00-0265 Scientist 19114.12 1824.843333 3 ... 4.0 Good 809.98 33.053114 22 Years and 10 Months No 49.574949 21.465380264657146 High_spent_Medium_value_payments 361.44400385378196
2 0x160c CUS_0xd40 November Aaron Maashoh 24 821-00-0265 Scientist 19114.12 1824.843333 3 ... 4.0 Good 809.98 33.811894 NaN No 49.574949 148.23393788500925 Low_spent_Medium_value_payments 264.67544623342997
3 0x160d CUS_0xd40 December Aaron Maashoh 24_ 821-00-0265 Scientist 19114.12 NaN 3 ... 4.0 Good 809.98 32.430559 23 Years and 0 Months No 49.574949 39.08251089460281 High_spent_Medium_value_payments 343.82687322383634
4 0x1616 CUS_0x21b1 September Rick Rothackerj 28 004-07-5839 _______ 34847.84 3037.986667 2 ... 5.0 Good 605.03 25.926822 27 Years and 3 Months No 18.816215 39.684018417945296 High_spent_Large_value_payments 485.2984336755923
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49995 0x25fe5 CUS_0x8600 December Sarah McBridec 4975 031-35-0942 Architect 20002.88 1929.906667 10 ... 12.0 _ 3571.7 34.780553 NaN Yes 60.964772 146.48632477751087 Low_spent_Small_value_payments 275.53956951573343
49996 0x25fee CUS_0x942c September Nicks 25 078-73-5990 Mechanic 39628.99 NaN 4 ... 7.0 Good 502.38 27.758522 31 Years and 11 Months NM 35.104023 181.44299902757518 Low_spent_Small_value_payments 409.39456169535066
49997 0x25fef CUS_0x942c October Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... 7.0 Good 502.38 36.858542 32 Years and 0 Months No 35.104023 __10000__ Low_spent_Large_value_payments 349.7263321025098
49998 0x25ff0 CUS_0x942c November Nicks 25 078-73-5990 Mechanic 39628.99 NaN 4 ... 7.0 Good 502.38 39.139840 32 Years and 1 Months No 35.104023 97.59857973344877 High_spent_Small_value_payments 463.23898098947717
49999 0x25ff1 CUS_0x942c December Nicks 25 078-73-5990 Mechanic 39628.99 3359.415833 4 ... 7.0 _ 502.38 34.108530 32 Years and 2 Months No 35.104023 220.45787812168732 Low_spent_Medium_value_payments 360.37968260123847

50000 rows × 27 columns

In [6]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   object 
 14  Delay_from_due_date       100000 non-null  int64  
 15  Num_of_Delayed_Payment    92998 non-null   object 
 16  Changed_Credit_Limit      100000 non-null  object 
 17  Num_Credit_Inquiries      98035 non-null   float64
 18  Credit_Mix                100000 non-null  object 
 19  Outstanding_Debt          100000 non-null  object 
 20  Credit_Utilization_Ratio  100000 non-null  float64
 21  Credit_History_Age        90970 non-null   object 
 22  Payment_of_Min_Amount     100000 non-null  object 
 23  Total_EMI_per_month       100000 non-null  float64
 24  Amount_invested_monthly   95521 non-null   object 
 25  Payment_Behaviour         100000 non-null  object 
 26  Monthly_Balance           98800 non-null   object 
 27  Credit_Score              100000 non-null  object 
dtypes: float64(4), int64(4), object(20)
memory usage: 21.4+ MB
In [7]:
for col in df_train.columns:
    unique_values = df_train[col].unique()
    print(f"Column: {col}")
    print(f"Unique Values (Top 20): {unique_values[:20]}")
    print(f"Total Unique Values: {len(unique_values)}")
    print("=" * 20)
Column: ID
Unique Values (Top 20): ['0x1602' '0x1603' '0x1604' '0x1605' '0x1606' '0x1607' '0x1608' '0x1609'
 '0x160e' '0x160f' '0x1610' '0x1611' '0x1612' '0x1613' '0x1614' '0x1615'
 '0x161a' '0x161b' '0x161c' '0x161d']
Total Unique Values: 100000
====================
Column: Customer_ID
Unique Values (Top 20): ['CUS_0xd40' 'CUS_0x21b1' 'CUS_0x2dbc' 'CUS_0xb891' 'CUS_0x1cdb'
 'CUS_0x95ee' 'CUS_0x284a' 'CUS_0x5407' 'CUS_0x4157' 'CUS_0xba08'
 'CUS_0xa66b' 'CUS_0xc0ab' 'CUS_0x3e45' 'CUS_0x6c66' 'CUS_0xff4'
 'CUS_0x33d2' 'CUS_0x6070' 'CUS_0xfdb' 'CUS_0x3553' 'CUS_0x4100']
Total Unique Values: 12500
====================
Column: Month
Unique Values (Top 20): ['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August']
Total Unique Values: 8
====================
Column: Name
Unique Values (Top 20): ['Aaron Maashoh' nan 'Rick Rothackerj' 'Langep' 'Jasond' 'Deepaa' 'Np'
 'Nadiaq' 'Annk' 'Charlie Zhur' 'Jamesj' 'Saphirj' 'Soyoungd'
 'Harriet McLeodd' 'Sinead Carews' 'Poornimaf' 'Chalmersa' 'Parkm'
 'Patrickg' 'Laurence Frosty']
Total Unique Values: 10140
====================
Column: Age
Unique Values (Top 20): ['23' '-500' '28_' '28' '34' '54' '55' '21' '31' '33' '34_' '7580' '30'
 '30_' '24' '24_' '44' '45' '40' '41']
Total Unique Values: 1788
====================
Column: SSN
Unique Values (Top 20): ['821-00-0265' '#F%$D@*&8' '004-07-5839' '486-85-3974' '072-31-6145'
 '615-06-7821' '612-70-8987' '411-51-0676' '500-92-6408' '070-19-1622'
 '366-68-1681' '221-30-8554' '342-90-2649' '414-53-2918' '328-33-6328'
 '655-05-7666' '965-46-2491' '891-55-9364' '928-91-4452' '084-25-3745']
Total Unique Values: 12501
====================
Column: Occupation
Unique Values (Top 20): ['Scientist' '_______' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']
Total Unique Values: 16
====================
Column: Annual_Income
Unique Values (Top 20): ['19114.12' '34847.84' '34847.84_' '143162.64' '30689.89' '30689.89_'
 '35547.71_' '35547.71' '73928.46' '131313.4' '10909427.0' '34081.38_'
 '34081.38' '114838.41' '114838.41_' '31370.8' '33751.27' '88640.24'
 '88640.24_' '54392.16']
Total Unique Values: 18940
====================
Column: Monthly_Inhand_Salary
Unique Values (Top 20): [ 1824.84333333            nan  3037.98666667 12187.22
  2612.49083333  2853.30916667  5988.705      11242.78333333
 10469.20775939  2611.115       9843.8675      2825.23333333
  2948.60583333  7266.68666667  4766.68         519.12875
  2415.855       2942.14833333  7591.59        2898.385     ]
Total Unique Values: 13236
====================
Column: Num_Bank_Accounts
Unique Values (Top 20): [   3    2    1    7    4    0    8    5    6    9   10 1414 1231   67
  572 1488   91  528 1647 1696]
Total Unique Values: 943
====================
Column: Num_Credit_Card
Unique Values (Top 20): [   4 1385    5 1288    1    7    6 1029  488    8 1381  898    3  518
 1005    9 1327 1189    2   10]
Total Unique Values: 1179
====================
Column: Interest_Rate
Unique Values (Top 20): [   3    6    8    4    5 5318   15    7   12   20    1  433   14   32
   16   17 5240 4975   10   31]
Total Unique Values: 1750
====================
Column: Num_of_Loan
Unique Values (Top 20): ['4' '1' '3' '967' '-100' '0' '0_' '2' '3_' '2_' '7' '5' '5_' '6' '8' '8_'
 '9' '9_' '4_' '7_']
Total Unique Values: 434
====================
Column: Type_of_Loan
Unique Values (Top 20): ['Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan'
 'Credit-Builder Loan' 'Auto Loan, Auto Loan, and Not Specified'
 'Not Specified' nan 'Credit-Builder Loan, and Mortgage Loan'
 'Not Specified, Auto Loan, and Student Loan'
 'Personal Loan, Debt Consolidation Loan, and Auto Loan'
 'Not Specified, and Payday Loan'
 'Credit-Builder Loan, Personal Loan, and Auto Loan'
 'Payday Loan, and Payday Loan'
 'Not Specified, Student Loan, and Personal Loan'
 'Personal Loan, Payday Loan, Student Loan, Auto Loan, Home Equity Loan, Student Loan, and Payday Loan'
 'Not Specified, Student Loan, Student Loan, Credit-Builder Loan, and Auto Loan'
 'Payday Loan, and Home Equity Loan'
 'Credit-Builder Loan, Not Specified, Mortgage Loan, Payday Loan, Credit-Builder Loan, and Personal Loan'
 'Mortgage Loan, Debt Consolidation Loan, Payday Loan, Auto Loan, and Not Specified'
 'Credit-Builder Loan, Mortgage Loan, Mortgage Loan, Credit-Builder Loan, and Student Loan'
 'Not Specified, Student Loan, and Student Loan'
 'Payday Loan, Not Specified, Credit-Builder Loan, Debt Consolidation Loan, Payday Loan, Not Specified, Student Loan, and Student Loan']
Total Unique Values: 6261
====================
Column: Delay_from_due_date
Unique Values (Top 20): [ 3 -1  5  6  8  7 13 10  0  4  9  1 12 11 30 31 34 27 14  2]
Total Unique Values: 73
====================
Column: Num_of_Delayed_Payment
Unique Values (Top 20): ['7' nan '4' '8_' '6' '1' '-1' '3_' '0' '8' '5' '3' '9' '12' '15' '17'
 '10' '2' '2_' '11']
Total Unique Values: 750
====================
Column: Changed_Credit_Limit
Unique Values (Top 20): ['11.27' '_' '6.27' '9.27' '5.42' '7.42' '6.42' '7.1' '11.1' '9.1' '1.99'
 '-2.01' '-1.01' '-3.01' '2.58' '10.14' '9.14' '9.34' '15.34' '8.34']
Total Unique Values: 4384
====================
Column: Num_Credit_Inquiries
Unique Values (Top 20): [4.000e+00 2.000e+00 3.000e+00       nan 5.000e+00 9.000e+00 8.000e+00
 7.000e+00 6.000e+00 0.000e+00 1.000e+00 1.000e+01 1.050e+03 1.100e+01
 1.200e+01 1.044e+03 1.700e+01 1.936e+03 1.300e+01 5.680e+02]
Total Unique Values: 1224
====================
Column: Credit_Mix
Unique Values (Top 20): ['_' 'Good' 'Standard' 'Bad']
Total Unique Values: 4
====================
Column: Outstanding_Debt
Unique Values (Top 20): ['809.98' '605.03' '1303.01' '632.46' '943.86' '548.2' '352.16' '1704.18'
 '1377.74' '421.43' '1328.93' '1328.93_' '950.36' '179.22' '2602.69'
 '758.44' '818.22' '1296.64' '1283.37' '1283.37_']
Total Unique Values: 13178
====================
Column: Credit_Utilization_Ratio
Unique Values (Top 20): [26.82261962 31.94496006 28.60935202 31.37786187 24.79734691 27.26225871
 22.53759303 23.9337948  24.46403064 38.55084843 33.22495079 39.18265566
 34.97789475 33.3810102  31.13170161 32.93385629 28.61673482 41.70257342
 26.51981539 39.50164811]
Total Unique Values: 100000
====================
Column: Credit_History_Age
Unique Values (Top 20): ['22 Years and 1 Months' nan '22 Years and 3 Months'
 '22 Years and 4 Months' '22 Years and 5 Months' '22 Years and 6 Months'
 '22 Years and 7 Months' '26 Years and 7 Months' '26 Years and 8 Months'
 '26 Years and 9 Months' '26 Years and 10 Months' '26 Years and 11 Months'
 '27 Years and 0 Months' '27 Years and 1 Months' '27 Years and 2 Months'
 '17 Years and 9 Months' '17 Years and 10 Months' '17 Years and 11 Months'
 '18 Years and 1 Months' '18 Years and 2 Months']
Total Unique Values: 405
====================
Column: Payment_of_Min_Amount
Unique Values (Top 20): ['No' 'NM' 'Yes']
Total Unique Values: 3
====================
Column: Total_EMI_per_month
Unique Values (Top 20): [4.95749492e+01 1.88162146e+01 2.46992319e+02 1.64154517e+01
 0.00000000e+00 1.50150000e+04 1.55150000e+04 1.37644605e+02
 9.11220179e+02 2.38340000e+04 3.26620000e+04 7.04783327e+01
 2.26892792e+02 4.66161291e+01 1.64150000e+04 6.50081743e+01
 1.35173371e+02 8.03570000e+04 1.24392082e+02 3.65481972e+01]
Total Unique Values: 14950
====================
Column: Amount_invested_monthly
Unique Values (Top 20): ['80.41529543900253' '118.28022162236736' '81.699521264648'
 '199.4580743910713' '41.420153086217326' '62.430172331195294'
 '178.3440674122349' '24.785216509052056' '104.291825168246'
 '40.39123782853101' '58.51597569589465' '99.30622796053305'
 '130.11542024292334' '43.477190144355745' '70.10177420755677'
 '218.90434353388733' '168.413702679309' '232.86038375993544' '__10000__'
 '825.2162699393922']
Total Unique Values: 91050
====================
Column: Payment_Behaviour
Unique Values (Top 20): ['High_spent_Small_value_payments' 'Low_spent_Large_value_payments'
 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' '!@9#%8'
 'High_spent_Large_value_payments']
Total Unique Values: 7
====================
Column: Monthly_Balance
Unique Values (Top 20): ['312.49408867943663' '284.62916249607184' '331.2098628537912'
 '223.45130972736786' '341.48923103222177' '340.4792117872438'
 '244.5653167062043' '358.12416760938714' '470.69062692529184'
 '484.5912142650067' '466.46647639764313' '465.6762241330048'
 '444.8670318506144' '481.505261949182' '464.8806778859809'
 '356.07810855965045' '1043.3159778669492' '998.8692967863226'
 '715.741367403555' '426.5134106068658']
Total Unique Values: 98793
====================
Column: Credit_Score
Unique Values (Top 20): ['Good' 'Standard' 'Poor']
Total Unique Values: 3
====================
In [8]:
df_train.isnull().sum()
Out[8]:
0
ID 0
Customer_ID 0
Month 0
Name 9985
Age 0
SSN 0
Occupation 0
Annual_Income 0
Monthly_Inhand_Salary 15002
Num_Bank_Accounts 0
Num_Credit_Card 0
Interest_Rate 0
Num_of_Loan 0
Type_of_Loan 11408
Delay_from_due_date 0
Num_of_Delayed_Payment 7002
Changed_Credit_Limit 0
Num_Credit_Inquiries 1965
Credit_Mix 0
Outstanding_Debt 0
Credit_Utilization_Ratio 0
Credit_History_Age 9030
Payment_of_Min_Amount 0
Total_EMI_per_month 0
Amount_invested_monthly 4479
Payment_Behaviour 0
Monthly_Balance 1200
Credit_Score 0

In [9]:
df_test.isnull().sum()
Out[9]:
0
ID 0
Customer_ID 0
Month 0
Name 5015
Age 0
SSN 0
Occupation 0
Annual_Income 0
Monthly_Inhand_Salary 7498
Num_Bank_Accounts 0
Num_Credit_Card 0
Interest_Rate 0
Num_of_Loan 0
Type_of_Loan 5704
Delay_from_due_date 0
Num_of_Delayed_Payment 3498
Changed_Credit_Limit 0
Num_Credit_Inquiries 1035
Credit_Mix 0
Outstanding_Debt 0
Credit_Utilization_Ratio 0
Credit_History_Age 4470
Payment_of_Min_Amount 0
Total_EMI_per_month 0
Amount_invested_monthly 2271
Payment_Behaviour 0
Monthly_Balance 562

Exploratory Data Analysis¶

In [10]:
df_train['is_train'] = 1
df_test['is_train'] = 0

# Combining the datasets into one DataFrame
df_group = pd.concat([df_train, df_test], ignore_index=True)

Reading & Cleaning¶

In [11]:
df_group.describe(include="all").T
Out[11]:
count unique top freq mean std min 25% 50% 75% max
ID 150000 150000 0x1602 1 NaN NaN NaN NaN NaN NaN NaN
Customer_ID 150000 12500 CUS_0xd40 12 NaN NaN NaN NaN NaN NaN NaN
Month 150000 12 January 12500 NaN NaN NaN NaN NaN NaN NaN
Name 135000 10139 Stevex 66 NaN NaN NaN NaN NaN NaN NaN
Age 150000 2524 39 4198 NaN NaN NaN NaN NaN NaN NaN
SSN 150000 12501 #F%$D@*&8 8400 NaN NaN NaN NaN NaN NaN NaN
Occupation 150000 16 _______ 10500 NaN NaN NaN NaN NaN NaN NaN
Annual_Income 150000 21192 36585.12 24 NaN NaN NaN NaN NaN NaN NaN
Monthly_Inhand_Salary 127500.0 NaN NaN NaN 4190.115139 3180.489657 303.645417 1625.265833 3091.0 5948.454596 15204.633333
Num_Bank_Accounts 150000.0 NaN NaN NaN 17.00694 117.069476 -1.0 3.0 6.0 7.0 1798.0
Num_Credit_Card 150000.0 NaN NaN NaN 22.623447 129.143006 0.0 4.0 5.0 7.0 1499.0
Interest_Rate 150000.0 NaN NaN NaN 71.234907 461.537193 1.0 8.0 13.0 20.0 5799.0
Num_of_Loan 150000 623 3 21500 NaN NaN NaN NaN NaN NaN NaN
Type_of_Loan 132888 6260 Not Specified 2112 NaN NaN NaN NaN NaN NaN NaN
Delay_from_due_date 150000.0 NaN NaN NaN 21.0634 14.860154 -5.0 10.0 18.0 28.0 67.0
Num_of_Delayed_Payment 139500 1058 19 7949 NaN NaN NaN NaN NaN NaN NaN
Changed_Credit_Limit 150000 4605 _ 3150 NaN NaN NaN NaN NaN NaN NaN
Num_Credit_Inquiries 147000.0 NaN NaN NaN 28.529014 194.456058 0.0 3.0 6.0 9.0 2597.0
Credit_Mix 150000 4 Standard 54858 NaN NaN NaN NaN NaN NaN NaN
Outstanding_Debt 150000 13622 1360.45 36 NaN NaN NaN NaN NaN NaN NaN
Credit_Utilization_Ratio 150000.0 NaN NaN NaN 32.283309 5.113315 20.0 28.054731 32.297058 36.487954 50.0
Credit_History_Age 136500 408 17 Years and 11 Months 628 NaN NaN NaN NaN NaN NaN NaN
Payment_of_Min_Amount 150000 3 Yes 78484 NaN NaN NaN NaN NaN NaN NaN
Total_EMI_per_month 150000.0 NaN NaN NaN 1432.513579 8403.759977 0.0 30.947775 71.280006 166.279555 82398.0
Amount_invested_monthly 143250 136497 __10000__ 6480 NaN NaN NaN NaN NaN NaN NaN
Payment_Behaviour 150000 7 Low_spent_Small_value_payments 38207 NaN NaN NaN NaN NaN NaN NaN
Monthly_Balance 148238 148224 __-333333333333333333333333333__ 15 NaN NaN NaN NaN NaN NaN NaN
Credit_Score 100000 3 Standard 53174 NaN NaN NaN NaN NaN NaN NaN
is_train 150000.0 NaN NaN NaN 0.666667 0.471406 0.0 0.0 1.0 1.0 1.0
In [12]:
df_group.isnull().any()
Out[12]:
0
ID False
Customer_ID False
Month False
Name True
Age False
SSN False
Occupation False
Annual_Income False
Monthly_Inhand_Salary True
Num_Bank_Accounts False
Num_Credit_Card False
Interest_Rate False
Num_of_Loan False
Type_of_Loan True
Delay_from_due_date False
Num_of_Delayed_Payment True
Changed_Credit_Limit False
Num_Credit_Inquiries True
Credit_Mix False
Outstanding_Debt False
Credit_Utilization_Ratio False
Credit_History_Age True
Payment_of_Min_Amount False
Total_EMI_per_month False
Amount_invested_monthly True
Payment_Behaviour False
Monthly_Balance True
Credit_Score True
is_train False

In [13]:
credit_score_counts = df_group['Credit_Score'].value_counts()

colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#C299FF']

plt.figure(figsize=(10, 8))
plt.bar(credit_score_counts.index, credit_score_counts.values, color=colors[:len(credit_score_counts)], edgecolor='black')
plt.title('Distribution of Target Variable: Credit_Score', fontsize=14)
plt.xlabel('Credit_Score', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image

The chart shows the distribution of credit scores, with most individuals falling into the Standard category. The Poor category follows, while the Good category has the fewest individuals.

In [14]:
colors = ['#FF5733', '#33FFBD', '#337BFF', '#FF33A1', '#A133FF']

plt.figure(figsize=(8, 6))
plt.pie(credit_score_counts.values, labels=credit_score_counts.index, colors=colors, autopct='%1.1f%%', startangle=90, pctdistance=0.85)


centre_circle = plt.Circle((0, 0), 0.70, fc='white')
plt.gca().add_artist(centre_circle)

plt.title('Distribution of Target Variable: Credit_Score', fontsize=14)
plt.show()
No description has been provided for this image

The pie chart shows that 53.2% of individuals fall into the Standard category, 29.0% into Poor, and 17.8% into Good.

Business Implications:¶

  1. Focus on improving the creditworthiness of the Poor segment with tailored strategies.
  2. Target the Good segment with premium products or higher credit limits to maximize profits.
  3. Maintain the Standard group with loyalty programs and monitor their credit behaviors.
In [15]:
df_group['ID'].duplicated().sum()
Out[15]:
0
In [16]:
df_group['Name'] = df_group.groupby('Customer_ID')['Name'].transform(lambda x: x.fillna(x.mode()[0]))
In [17]:
df_group['Age_Cleaned'] = pd.to_numeric(df_group['Age'].str.extract(r'(\d+)')[0], errors='coerce')
df_group['Age_Cleaned'] = df_group['Age_Cleaned'].fillna(df_group['Age_Cleaned'].median())
df_group['Age_Cleaned'] = df_group['Age_Cleaned'].astype(int)
print(df_group[['Age', 'Age_Cleaned']].head())
    Age  Age_Cleaned
0    23           23
1    23           23
2  -500          500
3    23           23
4    23           23
In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

# Density Plot
plt.figure(figsize=(10, 5))
sns.kdeplot(df_group['Age_Cleaned'], shade=True, color='blue')
plt.title('Density Plot of Variable: Age', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.grid(True)
plt.show()

# Violin Plot
plt.figure(figsize=(5, 5))
sns.violinplot(x=df_group['Age_Cleaned'], palette='muted')
plt.title('Violin Plot Distribution of Variable: Age', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.grid(True)
plt.show()
<ipython-input-18-3038c387d625>:6: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df_group['Age_Cleaned'], shade=True, color='blue')
No description has been provided for this image
<ipython-input-18-3038c387d625>:15: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x=df_group['Age_Cleaned'], palette='muted')
No description has been provided for this image
In [19]:
import re

def is_valid_ssn(ssn):
    pattern = r'^\d{3}-\d{2}-\d{4}$'
    return bool(re.match(pattern, ssn))

df_group['Is_Valid_SSN'] = df_group['SSN'].apply(is_valid_ssn)

valid_count = df_group['Is_Valid_SSN'].value_counts()

labels = ['Valid', 'Invalid']
colors = ['#66CC66', '#FF6666']
plt.figure(figsize=(6, 6))
plt.pie(valid_count, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'black'})
plt.title(' SSN Valid vs Invalid')
plt.show()
No description has been provided for this image

The chart shows 94.4% valid SSNs and 5.6% invalid SSNs.

Business Implications:¶

  1. Automate invalid SSN detection to improve efficiency and data integrity.
  2. Investigate invalid SSNs for potential fraud or incomplete records.
  3. Educate customers on submitting accurate SSNs to reduce errors.
  4. Ensure valid SSNs to meet regulatory and compliance requirements.

SSN

In [20]:
df_group[df_group['Is_Valid_SSN'] == 0]['SSN']
Out[20]:
SSN
7 #F%$D@*&8
29 #F%$D@*&8
51 #F%$D@*&8
54 #F%$D@*&8
98 #F%$D@*&8
... ...
149937 #F%$D@*&8
149950 #F%$D@*&8
149955 #F%$D@*&8
149968 #F%$D@*&8
149973 #F%$D@*&8

8400 rows × 1 columns


In [21]:
df_group['SSN_Cleaned'] = df_group['SSN'].apply(lambda x: x if isinstance(x, str) and len(x.split('-')) == 3 and all(part.isdigit() for part in x.split('-')) and len(x.split('-')[0]) == 3 and len(x.split('-')[1]) == 2 and len(x.split('-')[2]) == 4 else np.nan)

df_group['SSN_Cleaned'] = df_group.groupby('Customer_ID')['SSN_Cleaned'].transform(lambda x: x.ffill().bfill())

df_group['Is_Valid_SSN'] = df_group['SSN_Cleaned'].notnull()

df_group['Is_Valid_SSN'].value_counts()
Out[21]:
count
Is_Valid_SSN
True 150000

In [22]:
plt.figure(figsize=(15, 8))
df_group['Occupation'].value_counts().plot(kind='barh', color='skyblue', edgecolor='black')
plt.title('Distribution of Variable: Occupation', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Occupation', fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
In [23]:
df_group["Occupation"].value_counts()
Out[23]:
count
Occupation
_______ 10500
Lawyer 9899
Engineer 9562
Architect 9550
Mechanic 9459
Accountant 9404
Scientist 9403
Developer 9381
Media_Manager 9362
Teacher 9318
Entrepreneur 9277
Journalist 9122
Doctor 9114
Manager 8973
Musician 8858
Writer 8818

In [24]:
df_group['Occupation'] = df_group['Occupation'].replace('_______', np.nan)
df_group['Occupation'] = df_group.groupby('Customer_ID')['Occupation'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
df_group["Occupation"].value_counts()
<ipython-input-24-9ed0b60522c0>:2: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  df_group['Occupation'] = df_group.groupby('Customer_ID')['Occupation'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
Out[24]:
count
Occupation
Lawyer 10644
Engineer 10296
Architect 10236
Mechanic 10164
Scientist 10116
Accountant 10116
Developer 10080
Media_Manager 10080
Teacher 10008
Entrepreneur 9972
Doctor 9852
Journalist 9804
Manager 9648
Musician 9528
Writer 9456

In [25]:
occupation_counts = df_group['Occupation'].value_counts()

plt.figure(figsize=(10, 6))
occupation_counts.plot(kind='barh', color='steelblue', edgecolor='black')
plt.title('Distribution of Variable: Occupation')
plt.xlabel('Count')
plt.ylabel('Occupation')
plt.show()
No description has been provided for this image
In [26]:
df_group['Monthly_Inhand_Salary'] = df_group.groupby('Customer_ID')['Monthly_Inhand_Salary'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
df_group['Monthly_Inhand_Salary'].isnull().sum()
<ipython-input-26-1f58bad5e2a4>:1: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  df_group['Monthly_Inhand_Salary'] = df_group.groupby('Customer_ID')['Monthly_Inhand_Salary'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
Out[26]:
0
In [27]:
# Histogram
sns.histplot(df_group['Monthly_Inhand_Salary'], kde=True, bins=30, color='purple')
plt.title('Distribution of Variable: Monthly_Inhand_Salary')
plt.xlabel('Monthly_Inhand_Salary')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Monthly_Inhand_Salary'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Monthly_Inhand_Salary')
plt.xlabel('Monthly_Inhand_Salary')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [29]:
df_group['Annual_Income_Cleaned'] = df_group['Annual_Income'].str.replace('_', '', regex=False).astype(float)

sns.histplot(df_group['Annual_Income_Cleaned'], kde=True, bins=30, color='purple')
plt.title('Distribution of Variable: Annual_Income')
plt.xlabel('Annual_Income')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Annual_Income_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Annual_Income')
plt.xlabel('Annual_Income')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [30]:
income_ratio = df_group['Annual_Income_Cleaned'] / df_group['Monthly_Inhand_Salary']
print(income_ratio.describe())
count    150000.000000
mean         65.262083
std         801.323485
min           8.089821
25%          11.561407
50%          12.035127
75%          12.573912
max       54110.522117
dtype: float64

The average income ratio indicates that most individuals have annual incomes around 12-13 times their monthly salary, aligning with standard income patterns. Outliers, like extremely high ratios, may suggest errors, bonuses, or irregular income sources. Businesses should focus on accurate income verification and tailor financial products based on realistic income ratios.

In [31]:
Q1 = income_ratio.quantile(0.25)
Q3 = income_ratio.quantile(0.75)
IQR = Q3 - Q1

upper_bound = Q3 + 1.5 * IQR
upper_bound
Out[31]:
14.092668813782176

We define a reasonable upper limit for outliers as a scenario where the individual might receive bonuses or other additional income equal to 14 times the monthly salary. Any data point exceeding this threshold can be considered an outlier, as it deviates from the expected annual income range, even when accounting for bonuses and other irregular earnings.

In [35]:
ann_module = df_group.groupby("Customer_ID")["Annual_Income_Cleaned"].agg(
    lambda x: x.mode().iloc[0])

df_group["Annual_Income_Cleaned"] = df_group.apply(
    lambda row: ann_module[row["Customer_ID"]]
    if row["Annual_Income_Cleaned"] > row["Monthly_Inhand_Salary"] * 14
    else row["Annual_Income_Cleaned"],
    axis=1
)
In [36]:
df_group['Num_of_Loan'].value_counts()
Out[36]:
count
Num_of_Loan
3 21500
2 21423
4 20998
0 15543
1 15112
... ...
291 1
365 1
1014 1
1129_ 1
1296_ 1

623 rows × 1 columns


In [37]:
df_group['Num_of_Loan_Cleaned'] = df_group['Num_of_Loan'].str.replace(r'\D', '', regex=True).astype(int)

# Density Plot
plt.figure(figsize=(10, 6))
sns.kdeplot(df_group['Num_of_Loan_Cleaned'], fill=True, color='green')
plt.title('Density Plot of Variable: Num_of_Loan', fontsize=14)
plt.xlabel('Num_of_Loan', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.grid(True)
plt.show()

# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(x=df_group['Num_of_Loan_Cleaned'], color='purple')
plt.title('Violin Plot of Variable: Num_of_Loan', fontsize=14)
plt.xlabel('Num_of_Loan', fontsize=12)
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [38]:
df_group[df_group['Num_of_Loan_Cleaned'] < 100]['Num_of_Loan_Cleaned'].value_counts().head(20).sort_index()
Out[38]:
count
Num_of_Loan_Cleaned
0 16376
1 15901
2 22547
3 22618
4 22111
5 10814
6 11705
7 11024
8 4785
9 5539
23 2
31 2
33 2
42 1
49 2
50 3
53 1
55 2
58 3
95 3

The highest value is 9, and any value above this can be considered an anomaly. This threshold helps to identify data points that deviate significantly from the expected range, ensuring a more accurate analysis by flagging outliers for further investigation.

In [39]:
df_group.loc[df_group["Num_of_Loan_Cleaned"] > 9, "Num_of_Loan_Cleaned"] = np.nan

df_group["Num_of_Loan_Cleaned"] = df_group.groupby("Customer_ID")["Num_of_Loan_Cleaned"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-39-882e7865d870>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
In [40]:
sns.histplot(df_group['Num_of_Loan_Cleaned'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_of_Loan_Cleaned')
plt.xlabel('Num_of_Loan_Cleaned')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_of_Loan_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_of_Loan_Cleaned')
plt.xlabel('Num_of_Loan_Cleaned')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [41]:
sns.histplot(df_group['Num_Bank_Accounts'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Bank_Accounts'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [42]:
df_group[df_group['Num_Bank_Accounts'] < 100]['Num_Bank_Accounts'].value_counts().head(20).sort_index()
Out[42]:
count
Num_Bank_Accounts
-1 37
0 6494
1 6743
2 6456
3 17905
4 18286
5 18186
6 19505
7 19231
8 19152
9 8181
10 7846
11 28
28 3
34 4
39 3
43 4
70 4
74 5
79 3

The highest value is 10, and any value above this can be considered an anomaly. Similarly, any value below 0 is also anomalous, as it falls outside the valid range. These thresholds help to identify outliers both above and below the expected range, ensuring a cleaner and more accurate dataset for analysis.

In [43]:
df_group.loc[(df_group["Num_Bank_Accounts"] < 0) | (df_group["Num_Bank_Accounts"] > 10),
        "Num_Bank_Accounts"] = np.nan

df_group["Num_Bank_Accounts"] = df_group.groupby("Customer_ID")["Num_Bank_Accounts"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-43-eef4c02d9120>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
In [44]:
sns.histplot(df_group['Num_Bank_Accounts'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Bank_Accounts'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Bank_Accounts')
plt.xlabel('Num_Bank_Accounts')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [45]:
sns.histplot(df_group['Num_Credit_Card'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Card'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [46]:
df_group[df_group['Num_Credit_Card'] < 100]['Num_Credit_Card'].value_counts().head(20).sort_index()
Out[46]:
count
Num_Credit_Card
0 29
1 3195
2 3280
3 19816
4 21102
5 27669
6 24802
7 24886
8 7453
9 6976
10 7265
11 77
25 5
28 5
41 5
51 8
66 5
71 5
77 5
92 6

In [47]:
df_group.loc[(df_group["Num_Credit_Card"] < 1) | (df_group["Num_Credit_Card"] > 10),
        "Num_Credit_Card"] = np.nan

df_group["Num_Credit_Card"] = df_group.groupby("Customer_ID")["Num_Credit_Card"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-47-5754288aab31>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
In [48]:
sns.histplot(df_group['Num_Credit_Card'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Card'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Card')
plt.xlabel('Num_Credit_Card')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [49]:
sns.histplot(df_group['Interest_Rate'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Interest_Rate'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [50]:
df_group[df_group['Interest_Rate'] < 20]['Interest_Rate'].value_counts().head(20).sort_index()
Out[50]:
count
Interest_Rate
1 4027
2 3710
3 4153
4 3876
5 7479
6 7089
7 6744
8 7515
9 6747
10 6799
11 6626
12 6828
13 3571
14 3351
15 5984
16 5597
17 5719
18 6154
19 5440

In [51]:
filtered_interest_rate = df_group[(df_group['Interest_Rate'] > 20) & (df_group['Interest_Rate'] < 100)]
interest_rate_counts = filtered_interest_rate['Interest_Rate'].value_counts().head(20).sort_index()

print(interest_rate_counts)
Interest_Rate
21    2335
22    2580
23    2530
24    2533
25    2356
26    2238
27    2416
28    2431
29    2495
30    2536
31    2188
32    2616
33    2201
34    2246
59       2
75       2
76       2
81       1
89       2
95       2
Name: count, dtype: int64

The highest value is 34, and any value above this can be considered an anomaly. This threshold identifies outliers that exceed the expected range, ensuring that the data remains consistent and reliable for analysis.

In [52]:
df_group.loc[df_group["Interest_Rate"] > 34,
        "Interest_Rate"] = np.nan

df_group["Interest_Rate"] = df_group.groupby("Customer_ID")["Interest_Rate"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-52-fb42c8ef6ab2>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
In [53]:
sns.histplot(df_group['Interest_Rate'], kde=True, bins=30, color='blue')
plt.title('Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Interest_Rate'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Interest_Rate')
plt.xlabel('Interest_Rate')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [54]:
df_group['Type_of_Loan'].nunique()
print('\n', df_group['Type_of_Loan'].value_counts())
 Type_of_Loan
Not Specified                                                                                                                         2112
Credit-Builder Loan                                                                                                                   1920
Personal Loan                                                                                                                         1908
Debt Consolidation Loan                                                                                                               1896
Student Loan                                                                                                                          1860
                                                                                                                                      ... 
Not Specified, Mortgage Loan, Auto Loan, and Payday Loan                                                                                12
Payday Loan, Mortgage Loan, Debt Consolidation Loan, and Student Loan                                                                   12
Debt Consolidation Loan, Auto Loan, Personal Loan, Debt Consolidation Loan, Student Loan, and Credit-Builder Loan                       12
Student Loan, Auto Loan, Student Loan, Credit-Builder Loan, Home Equity Loan, Debt Consolidation Loan, and Debt Consolidation Loan      12
Personal Loan, Auto Loan, Mortgage Loan, Student Loan, and Student Loan                                                                 12
Name: count, Length: 6260, dtype: int64
In [55]:
df_group[df_group['Type_of_Loan'].isnull() == True][['Customer_ID', 'Type_of_Loan', 'Num_of_Loan_Cleaned']].head(10)
Out[55]:
Customer_ID Type_of_Loan Num_of_Loan_Cleaned
32 CUS_0x1cdb NaN 0.0
33 CUS_0x1cdb NaN 0.0
34 CUS_0x1cdb NaN 0.0
35 CUS_0x1cdb NaN 0.0
36 CUS_0x1cdb NaN 0.0
37 CUS_0x1cdb NaN 0.0
38 CUS_0x1cdb NaN 0.0
39 CUS_0x1cdb NaN 0.0
40 CUS_0x95ee NaN 0.0
41 CUS_0x95ee NaN 0.0
In [56]:
df_group['Type_of_Loan'].fillna('No Loan', inplace=True)
df_group["Type_of_Loan_Cleaned"] = df_group["Type_of_Loan"].str.replace(" and", "", regex=False)
df_group['Type_of_Loan_Cleaned'] = df_group['Type_of_Loan_Cleaned'].str.split(', ')

unique_loans = set()
for loans in df_group['Type_of_Loan_Cleaned']:
    for loan in loans:
        loan_items = [item.strip() for item in loan.split(",")]
        unique_loans.update(loan_items)

unique_loans = sorted(unique_loans)
unique_loans
<ipython-input-56-12ef8776b8f6>:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_group['Type_of_Loan'].fillna('No Loan', inplace=True)
Out[56]:
['Auto Loan',
 'Credit-Builder Loan',
 'Debt Consolidation Loan',
 'Home Equity Loan',
 'Mortgage Loan',
 'No Loan',
 'Not Specified',
 'Payday Loan',
 'Personal Loan',
 'Student Loan']
In [57]:
df_group['Type_of_Loan_Cleaned'] = df_group['Type_of_Loan_Cleaned'].apply(
    lambda x: ', '.join(sorted(set(x)))
)

df_group['Type_of_Loan_Cleaned'].value_counts()
Out[57]:
count
Type_of_Loan_Cleaned
No Loan 17112
Not Specified 2340
Credit-Builder Loan 2232
Personal Loan 2232
Student Loan 2208
... ...
Auto Loan, Credit-Builder Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan, Student Loan 24
Auto Loan, Credit-Builder Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Student Loan 24
Auto Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Payday Loan, Student Loan 24
Credit-Builder Loan, Debt Consolidation Loan, Home Equity Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan, Student Loan 12
Auto Loan, Credit-Builder Loan, Debt Consolidation Loan, Mortgage Loan, Not Specified, Payday Loan, Personal Loan 12

508 rows × 1 columns


In [58]:
loan_counts = []
for loans in unique_loans:
    count_items = len(df_group[df_group['Type_of_Loan_Cleaned'].str.contains(loans, na=False)])
    loan_counts.append({'Type': loans, 'Count': count_items})

loan_counts
Out[58]:
[{'Type': 'Auto Loan', 'Count': 45840},
 {'Type': 'Credit-Builder Loan', 'Count': 47592},
 {'Type': 'Debt Consolidation Loan', 'Count': 46560},
 {'Type': 'Home Equity Loan', 'Count': 47100},
 {'Type': 'Mortgage Loan', 'Count': 47040},
 {'Type': 'No Loan', 'Count': 17112},
 {'Type': 'Not Specified', 'Count': 47520},
 {'Type': 'Payday Loan', 'Count': 47916},
 {'Type': 'Personal Loan', 'Count': 46656},
 {'Type': 'Student Loan', 'Count': 46560}]
In [59]:
loans = pd.DataFrame(loan_counts)
plt.figure(figsize=(12, 6))
plt.bar(loans['Type'], loans['Count'], color='skyblue', edgecolor='black')
plt.title('Frequency of Loan Types', fontsize=16)
plt.xlabel('Loan Type', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
No description has been provided for this image
In [60]:
df_group['Delay_from_due_date'].value_counts()
Out[60]:
count
Delay_from_due_date
15 5355
13 5185
8 5004
14 4949
10 4926
... ...
63 90
65 86
-5 51
66 44
67 29

73 rows × 1 columns


In [61]:
negative_delays = df_group[df_group['Delay_from_due_date'] < 0]
print(negative_delays['Delay_from_due_date'].describe())
count    889.000000
mean      -2.271091
std        1.221830
min       -5.000000
25%       -3.000000
50%       -2.000000
75%       -1.000000
max       -1.000000
Name: Delay_from_due_date, dtype: float64

Negative Delay from Due Date indicates early payment, which doesn't need adjustment as the distribution is normal, except for extreme cases like -30 or more. We'll verify data accuracy by checking differences exceeding 30 days between consecutive delay_from_due_date entries for each Customer_ID.

In [62]:
def check_delays_each_costumer(group):
    return (group['Delay_from_due_date'].diff().abs() > 30).any()

check = df_group.groupby("Customer_ID").apply(check_delays_each_costumer) == True
check.sum()
<ipython-input-62-21e4f303ec75>:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  check = df_group.groupby("Customer_ID").apply(check_delays_each_costumer) == True
Out[62]:
0
In [63]:
# Line Plot
plt.figure(figsize=(10, 6))
delay_counts = df_group['Delay_from_due_date'].value_counts().sort_index()
plt.plot(delay_counts.index, delay_counts.values, marker='o', color='green')
plt.title('Line Plot of Variable: Delay_from_due_date', fontsize=14)
plt.xlabel('Delay_from_due_date', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True)
plt.show()

# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(x=df_group['Delay_from_due_date'], color='purple')
plt.title('Violin Plot of Variable: Delay_from_due_date', fontsize=14)
plt.xlabel('Delay_from_due_date', fontsize=12)
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [64]:
df_group['Num_of_Delayed_Payment_Cleaned'] = df_group['Num_of_Delayed_Payment'].str.replace('_', '', regex=False).astype(float)
df_group[df_group['Num_of_Delayed_Payment_Cleaned'] < 10]['Num_of_Delayed_Payment_Cleaned'].value_counts().head()
Out[64]:
count
Num_of_Delayed_Payment_Cleaned
9.0 7421
8.0 7303
7.0 3571
6.0 3435
5.0 3154

In [65]:
df_group[df_group['Num_of_Delayed_Payment_Cleaned'] > 25]['Num_of_Delayed_Payment_Cleaned'].value_counts().head(15)
Out[65]:
count
Num_of_Delayed_Payment_Cleaned
26.0 472
27.0 354
28.0 196
3484.0 4
538.0 3
265.0 3
1150.0 3
1014.0 3
2801.0 3
4211.0 3
1946.0 3
2606.0 3
975.0 2
549.0 2
762.0 2

The highest value is 28, and any value above this can be considered an anomaly. Additionally, values below 0 are also anomalous, as they fall outside the expected range. These thresholds help in identifying outliers for further investigation or cleaning.

In [66]:
df_group.loc[(df_group["Num_of_Delayed_Payment_Cleaned"] < 0) | (df_group["Num_of_Delayed_Payment_Cleaned"] > 28),
        "Num_of_Delayed_Payment_Cleaned"] = np.nan

df_group["Num_of_Delayed_Payment_Cleaned"] = df_group.groupby("Customer_ID")["Num_of_Delayed_Payment_Cleaned"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))

# Histogram
plt.hist(df_group["Num_of_Delayed_Payment_Cleaned"], bins=10, edgecolor='black')
plt.title('Distribution of Variable: Num_of_Delayed_Payment_Cleaned')
plt.xlabel('Num_of_Delayed_Payment_Cleaned')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Num_of_Delayed_Payment_Cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_of_Delayed_Payment_Cleaned')
plt.xlabel('Num_of_Delayed_Payment_Cleaned')
plt.grid(True)
plt.show()
<ipython-input-66-a19df7a99c39>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
No description has been provided for this image
No description has been provided for this image
In [68]:
df_group['Changed_Credit_Limit'].value_counts().head(50)
Out[68]:
count
Changed_Credit_Limit
_ 3150
11.5 197
11.32 189
8.22 189
7.35 181
10.06 178
8.23 169
7.69 166
7.01 165
11.49 164
7.33 163
9.25 162
3.93 161
1.63 159
8.99 156
8.3 156
7.63 155
8.54 153
8.82 152
7.23 152
9.58 151
11.73 151
10.3 151
8.76 149
9.13 149
11.63 148
11.78 148
11.95 148
9.88 146
8.04 146
9.2 145
7.06 144
7.66 143
10.54 143
8.34 141
7.64 140
4.92 140
4.86 140
9.09 139
8.74 138
9.18 138
1.59 138
5.99 137
9.97 137
7.91 137
0.57 136
8.56 136
11.51 136
8.67 136
10.64 135

In [69]:
df_group.loc[(df_group["Changed_Credit_Limit"] == "_"), "Changed_Credit_Limit"] = np.nan

df_group['Changed_Credit_Limit_Cleaned'] = pd.to_numeric(
    df_group['Changed_Credit_Limit'].str.replace('_', '', regex=False),
    errors='coerce'
)

df_group["Changed_Credit_Limit_Cleaned"] = df_group.groupby("Customer_ID")["Changed_Credit_Limit_Cleaned"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))

df_group['Changed_Credit_Limit_Cleaned'] = df_group['Changed_Credit_Limit_Cleaned'].round(3)
df_group['Changed_Credit_Limit_Cleaned'].value_counts()
<ipython-input-69-c3734b7ef243>:9: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
Out[69]:
count
Changed_Credit_Limit_Cleaned
11.50 198
8.22 196
11.32 190
7.35 186
7.69 181
... ...
32.11 1
24.27 1
31.15 1
29.20 1
29.17 1

3770 rows × 1 columns


In [70]:
# Histogram
sns.histplot(df_group['Changed_Credit_Limit_Cleaned'], kde=True, bins=30, color='lime')
plt.title('Distribution Distribution of Variable: Changed_Credit_Limit')
plt.xlabel('Changed_Credit_Limit')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Changed_Credit_Limit_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Changed_Credit_Limit')
plt.xlabel('Changed_Credit_Limit')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [71]:
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Inquiries'].dropna(), vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.grid(True)
plt.show()
No description has been provided for this image
In [72]:
df_group[df_group['Num_Credit_Inquiries'] > 10]['Num_Credit_Inquiries'].value_counts().head(10)
Out[72]:
count
Num_Credit_Inquiries
11.0 8047
12.0 7156
13.0 3545
14.0 2433
15.0 1871
16.0 1107
17.0 672
769.0 5
1460.0 5
1114.0 5

The highest value is 17, and any value above this can be considered an anomaly. This threshold helps identify outliers that deviate significantly from the expected range.

In [73]:
df_group.loc[(df_group["Num_Credit_Inquiries"] > 17), "Num_Credit_Inquiries"] = np.nan

df_group["Num_Credit_Inquiries"] = df_group.groupby("Customer_ID")["Num_Credit_Inquiries"].transform(
    lambda x: x.fillna(method="ffill").fillna(method="bfill"))

sns.histplot(df_group['Num_Credit_Inquiries'], kde=True, bins=30, color='lime')
plt.title('Distribution Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Num_Credit_Inquiries'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Num_Credit_Inquiries')
plt.xlabel('Num_Credit_Inquiries')
plt.grid(True)
plt.show()
<ipython-input-73-9d75bb1fb014>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
No description has been provided for this image
No description has been provided for this image
In [74]:
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Mix')
plt.xlabel('Credit_Mix')
plt.ylabel('Count')
plt.show()
<ipython-input-74-613dbba4248f>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
No description has been provided for this image
In [75]:
df_group.loc[(df_group["Credit_Mix"] == '_'), "Credit_Mix"] = np.nan

df_group["Credit_Mix"] = df_group.groupby("Customer_ID")["Credit_Mix"].transform(
    lambda x: x.fillna(x.mode()[0]))

plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Mix')
plt.xlabel('Credit_Mix')
plt.ylabel('Count')
plt.show()
<ipython-input-75-9698a37e768e>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Credit_Mix', data=df_group, palette='pastel')
No description has been provided for this image
In [76]:
df_group['Outstanding_Debt_Cleaned'] = pd.to_numeric(
    df_group['Outstanding_Debt'].str.replace('_', '', regex=False),
    errors='coerce'
)

# Histogram
sns.histplot(df_group['Outstanding_Debt_Cleaned'], kde=True, bins=30, color='yellow')
plt.title('Distribution of Variable: Outstanding_Debt')
plt.xlabel('Outstanding_Debt')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Outstanding_Debt_Cleaned'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Outstanding_Debt')
plt.xlabel('Outstanding_Debt')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [77]:
# Histogram
sns.histplot(df_group['Credit_Utilization_Ratio'], kde=True, bins=30, color='blue')
plt.title('Distribution Credit_Utilization_Ratio in Months')
plt.xlabel('Credit_Utilization_Ratio')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Credit_Utilization_Ratio'], vert=False, patch_artist=True)
plt.title('Boxplot Distribution of Variable: Credit_Utilization_Ratio')
plt.xlabel('Credit_Utilization_Ratio')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [78]:
def convert_to_months(value):
    if pd.isnull(value):
        return np.nan
    match = re.match(r'(\d+) Years and (\d+) Months', value)
    if match:
        years, months = int(match.group(1)), int(match.group(2))
        return years * 12 + months
    return np.nan

df_group['Credit_History_Age_in_Months'] = df_group['Credit_History_Age'].apply(convert_to_months)
In [79]:
df_group["Credit_History_Age_in_Months"].value_counts()
Out[79]:
count
Credit_History_Age_in_Months
215.0 628
220.0 621
219.0 617
237.0 615
218.0 615
... ...
3.0 20
2.0 15
407.0 15
408.0 14
1.0 2

408 rows × 1 columns


In [80]:
df_group.groupby("Customer_ID")["Credit_History_Age_in_Months"].value_counts(
    dropna=False).head(20)
Out[80]:
count
Customer_ID Credit_History_Age_in_Months
CUS_0x1000 122.0 1
123.0 1
124.0 1
125.0 1
126.0 1
127.0 1
128.0 1
129.0 1
130.0 1
131.0 1
132.0 1
133.0 1
CUS_0x1009 365.0 1
366.0 1
367.0 1
369.0 1
370.0 1
371.0 1
372.0 1
373.0 1

In [81]:
def fill_nan_credit(df):
    # Forward fill: Isi NaN dengan nilai sebelumnya + 1
    df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='ffill') + 1
    # Backward fill: Isi NaN yang tersisa dengan nilai berikutnya - 1
    df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='bfill') - 1
    return df

df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fill_nan_credit)
df_group['Credit_History_Age_in_Months'].isnull().sum()
<ipython-input-81-472e60c0e87f>:3: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='ffill') + 1
<ipython-input-81-472e60c0e87f>:5: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  df['Credit_History_Age_in_Months'] = df['Credit_History_Age_in_Months'].fillna(method='bfill') - 1
<ipython-input-81-472e60c0e87f>:8: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fill_nan_credit)
Out[81]:
0
In [82]:
plt.figure(figsize=(10, 5))
sns.histplot(df_group['Credit_History_Age_in_Months'], kde=True, bins=30, color='red')
plt.title('Distribution Credit History Age in Months')
plt.xlabel('Total Months')
plt.ylabel('Count')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Credit_History_Age_in_Months'], vert=False, patch_artist=True)
plt.title('Boxplot Credit History Age In Months')
plt.xlabel('Total Months')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [83]:
plt.figure(figsize=(8, 6))
sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Payment_of_Min_Amount')
plt.xlabel('Payment_of_Min_Amount')
plt.ylabel('Count')
plt.show()
<ipython-input-83-2601dd4ddbec>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
No description has been provided for this image
In [84]:
df_group.groupby("Customer_ID")["Payment_of_Min_Amount"].value_counts(
    dropna=False).head(20)
Out[84]:
count
Customer_ID Payment_of_Min_Amount
CUS_0x1000 Yes 12
CUS_0x1009 Yes 11
NM 1
CUS_0x100b No 11
NM 1
CUS_0x1011 Yes 11
NM 1
CUS_0x1013 No 11
NM 1
CUS_0x1015 Yes 10
NM 2
CUS_0x1018 Yes 10
NM 2
CUS_0x1026 No 11
NM 1
CUS_0x102d No 10
NM 2
CUS_0x102e Yes 11
NM 1
CUS_0x1032 Yes 11

In [85]:
def replace_nm_with_majority(group):
    majority_value = group['Payment_of_Min_Amount'][group['Payment_of_Min_Amount'] != 'NM'].mode()[0]
    group['Payment_of_Min_Amount'] = group['Payment_of_Min_Amount'].replace('NM', majority_value)
    return group

df_group = df_group.groupby('Customer_ID', group_keys=False).apply(replace_nm_with_majority)
df_group["Payment_of_Min_Amount"].isnull().sum()
<ipython-input-85-0baa9dd83691>:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  df_group = df_group.groupby('Customer_ID', group_keys=False).apply(replace_nm_with_majority)
Out[85]:
0
In [86]:
plt.figure(figsize=(8, 6))
sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
plt.title('Distribution of Target Variable: Payment_of_Min_Amount')
plt.xlabel('Payment_of_Min_Amount')
plt.ylabel('Count')
plt.show()
<ipython-input-86-2601dd4ddbec>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Payment_of_Min_Amount', data=df_group, palette='pastel')
No description has been provided for this image
In [87]:
plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Total_EMI_per_month'], vert=False, patch_artist=True)
plt.title('Boxplot Total_EMI_per_month')
plt.xlabel('Total_EMI_per_month')
plt.grid(True)
plt.show()
No description has been provided for this image
In [88]:
df_group['Total_EMI_per_month'].value_counts()
Out[88]:
count
Total_EMI_per_month
0.000000 15615
49.574949 12
16.528703 12
64.443403 12
331.719510 12
... ...
10404.000000 1
15457.000000 1
59578.000000 1
43183.000000 1
33013.000000 1

16960 rows × 1 columns


In [89]:
df_group.loc[df_group["Total_EMI_per_month"] > df_group["Monthly_Inhand_Salary"] * 0.3]["Total_EMI_per_month"].value_counts()
Out[89]:
count
Total_EMI_per_month
10335.0 4
17407.0 3
29766.0 3
26590.0 3
71993.0 3
... ...
53062.0 1
58509.0 1
24556.0 1
78530.0 1
33013.0 1

4633 rows × 1 columns


In [90]:
df_group.loc[df_group["Total_EMI_per_month"] > df_group["Monthly_Inhand_Salary"] * 0.3, "Total_EMI_per_month"] = np.nan

df_group["Total_EMI_per_month"] = df_group.groupby(
    "Customer_ID")["Total_EMI_per_month"].transform(
        lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x)

def fix_inconsistent_emi(group):
    # Hitung nilai modus (mode) untuk setiap grup customer_id
    mode_value = group['Total_EMI_per_month'].mode()
    if not mode_value.empty:
        # Ganti nilai yang hanya muncul 1 kali dan bukan nilai pertama atau terakhir
        group['Total_EMI_per_month'] = group['Total_EMI_per_month'].apply(
            lambda x: mode_value[0] if group['Total_EMI_per_month'].value_counts()[x] == 1 and
            x != group['Total_EMI_per_month'].iloc[0] and x != group['Total_EMI_per_month'].iloc[-1] else x
        )
    return group

df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fix_inconsistent_emi)

plt.figure(figsize=(10, 5))
sns.histplot(df_group["Total_EMI_per_month"], kde=True, bins=30, color='skyblue')
plt.title('Distribution Amount Invested Monthly')
plt.xlabel('Amount Invested Monthly')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group['Total_EMI_per_month'], vert=False, patch_artist=True)
plt.title('Boxplot Total_EMI_per_month')
plt.xlabel('Total_EMI_per_month')
plt.grid(True)
plt.show()
<ipython-input-90-f10def76ad72>:18: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  df_group = df_group.groupby("Customer_ID", group_keys=False).apply(fix_inconsistent_emi)
No description has been provided for this image
No description has been provided for this image
In [91]:
df_group['Amount_invested_monthly'].value_counts(dropna=False)
Out[91]:
count
Amount_invested_monthly
NaN 6750
__10000__ 6480
0.0 275
80.41529543900253 1
16.53218878920387 1
... ...
209.17274569312266 1
333.0148085469461 1
274.68712877851107 1
220.58121173366908 1
220.45787812168732 1

136498 rows × 1 columns


In [92]:
df_group['Amount_invested_monthly_cleaned'] = (
    df_group['Amount_invested_monthly']
    .str.replace('__', '', regex=False)
    .astype(float)
)

df_group["Amount_invested_monthly_cleaned"] = df_group.groupby("Customer_ID")["Amount_invested_monthly_cleaned"].transform(
        lambda x: x.fillna(method="ffill").fillna(method="bfill"))

df_group['Amount_invested_monthly_cleaned'] = df_group['Amount_invested_monthly_cleaned'].round(3)

# Histogram
plt.figure(figsize=(10, 5))
sns.histplot(df_group["Amount_invested_monthly_cleaned"], kde=True, bins=30, color='skyblue')
plt.title('Distribution Amount Invested Monthly')
plt.xlabel('Amount Invested Monthly')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Boxplot
plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Amount_invested_monthly_cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Amount Invested Monthly')
plt.xlabel('Amount')
plt.grid(True)
plt.show()
<ipython-input-92-1834a8f96089>:8: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
No description has been provided for this image
No description has been provided for this image
In [93]:
plt.figure(figsize=(10, 6))
sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
plt.title('Distribution of Target Variable: Payment_Behaviour')
plt.xlabel('Count')
plt.show()
<ipython-input-93-bb439be2e0aa>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
No description has been provided for this image
In [94]:
df_group.groupby(["Customer_ID"])["Payment_Behaviour"].nunique().head(20)
Out[94]:
Payment_Behaviour
Customer_ID
CUS_0x1000 6
CUS_0x1009 5
CUS_0x100b 7
CUS_0x1011 4
CUS_0x1013 5
CUS_0x1015 7
CUS_0x1018 5
CUS_0x1026 5
CUS_0x102d 6
CUS_0x102e 6
CUS_0x1032 6
CUS_0x1037 4
CUS_0x1038 7
CUS_0x103e 4
CUS_0x1041 5
CUS_0x1044 6
CUS_0x1048 7
CUS_0x104a 4
CUS_0x104e 6
CUS_0x104f 4

In [95]:
df_group.loc[(df_group["Payment_Behaviour"] == "!@9#%8"), "Payment_Behaviour"] = np.nan

df_group["Payment_Behaviour"] = df_group.groupby("Customer_ID")["Payment_Behaviour"].transform(
        lambda x: x.fillna(method="ffill").fillna(method="bfill"))

plt.figure(figsize=(10, 6))
sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
plt.title('Distribution of Target Variable: Payment_Behaviour')
plt.xlabel('Count')
plt.show()
<ipython-input-95-25f2ac98a2ae>:4: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  lambda x: x.fillna(method="ffill").fillna(method="bfill"))
<ipython-input-95-25f2ac98a2ae>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y='Payment_Behaviour', data=df_group, palette='pastel', orient='h')
No description has been provided for this image
In [96]:
df_group['Monthly_Balance'].value_counts(dropna=False)
Out[96]:
count
Monthly_Balance
NaN 1762
__-333333333333333333333333333__ 15
600.1125089726384 1
120.54247208897846 1
784.0174708573453 1
... ...
279.9825060023594 1
260.67157274114965 1
250.4915113003876 1
243.8753153006728 1
360.37968260123847 1

148225 rows × 1 columns


In [97]:
df_group[df_group['Monthly_Balance'].str.contains('__', na=False)]['Monthly_Balance'].value_counts()
Out[97]:
count
Monthly_Balance
__-333333333333333333333333333__ 15

In [98]:
df_group['Monthly_Balance_Cleaned'] = df_group['Monthly_Balance'].replace(
    "__-333333333333333333333333333__", 0)

df_group['Monthly_Balance_Cleaned'] = pd.to_numeric(df_group['Monthly_Balance_Cleaned'], errors='coerce')
df_group['Monthly_Balance_Cleaned'] = df_group['Monthly_Balance_Cleaned'].round(3)

customer = df_group[df_group['Monthly_Balance_Cleaned'] == 0]
df_group[df_group['Customer_ID'].isin(customer['Customer_ID'])][['Customer_ID', 'Monthly_Balance_Cleaned']].head(10)
Out[98]:
Customer_ID Monthly_Balance_Cleaned
5544 CUS_0x9885 423.397
5545 CUS_0x9885 0.000
5546 CUS_0x9885 278.412
5547 CUS_0x9885 420.557
5548 CUS_0x9885 71.288
5549 CUS_0x9885 383.284
5550 CUS_0x9885 229.007
5551 CUS_0x9885 374.031
22720 CUS_0x4379 317.267
22721 CUS_0x4379 290.461
In [99]:
df_group.loc[df_group["Monthly_Balance_Cleaned"] == 0, "Monthly_Balance_Cleaned"] = df_group.loc[df_group["Monthly_Balance_Cleaned"] == 0, "Customer_ID"].map(
        df_group[df_group["Monthly_Balance_Cleaned"] != 0].groupby(
            "Customer_ID")["Monthly_Balance_Cleaned"].mean())

df_group["Monthly_Balance_Cleaned"] = df_group.groupby(
    "Customer_ID")["Monthly_Balance_Cleaned"].transform(lambda x: x.fillna(x.mean()))

plt.figure(figsize=(10, 5))
sns.histplot(df_group["Monthly_Balance_Cleaned"], kde=True, bins=30, color='green')
plt.title('Distribusi Monthly Balance')
plt.xlabel('Monthly Balance')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

plt.figure(figsize=(5, 5))
plt.boxplot(df_group["Monthly_Balance_Cleaned"], vert=False, patch_artist=True)
plt.title('Boxplot Monthly Balance')
plt.xlabel('Monthly Balance')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image
In [100]:
df_group = df_group.drop(columns=['Age','SSN', 'Is_Valid_SSN', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Outstanding_Debt', 'Credit_History_Age', 'Amount_invested_monthly', 'Monthly_Balance'])
df_group.info()
<class 'pandas.core.frame.DataFrame'>
Index: 150000 entries, 0 to 149999
Data columns (total 29 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   ID                               150000 non-null  object 
 1   Customer_ID                      150000 non-null  object 
 2   Month                            150000 non-null  object 
 3   Name                             150000 non-null  object 
 4   Occupation                       150000 non-null  object 
 5   Monthly_Inhand_Salary            150000 non-null  float64
 6   Num_Bank_Accounts                150000 non-null  float64
 7   Num_Credit_Card                  150000 non-null  float64
 8   Interest_Rate                    150000 non-null  float64
 9   Delay_from_due_date              150000 non-null  int64  
 10  Num_Credit_Inquiries             150000 non-null  float64
 11  Credit_Mix                       150000 non-null  object 
 12  Credit_Utilization_Ratio         150000 non-null  float64
 13  Payment_of_Min_Amount            150000 non-null  object 
 14  Total_EMI_per_month              150000 non-null  float64
 15  Payment_Behaviour                150000 non-null  object 
 16  Credit_Score                     100000 non-null  object 
 17  is_train                         150000 non-null  int64  
 18  Age_Cleaned                      150000 non-null  int64  
 19  SSN_Cleaned                      150000 non-null  object 
 20  Annual_Income_Cleaned            150000 non-null  float64
 21  Num_of_Loan_Cleaned              150000 non-null  float64
 22  Type_of_Loan_Cleaned             150000 non-null  object 
 23  Num_of_Delayed_Payment_Cleaned   150000 non-null  float64
 24  Changed_Credit_Limit_Cleaned     150000 non-null  float64
 25  Outstanding_Debt_Cleaned         150000 non-null  float64
 26  Credit_History_Age_in_Months     150000 non-null  float64
 27  Amount_invested_monthly_cleaned  150000 non-null  float64
 28  Monthly_Balance_Cleaned          150000 non-null  float64
dtypes: float64(15), int64(3), object(11)
memory usage: 38.4+ MB
In [101]:
df_group
Out[101]:
ID Customer_ID Month Name Occupation Monthly_Inhand_Salary Num_Bank_Accounts Num_Credit_Card Interest_Rate Delay_from_due_date ... SSN_Cleaned Annual_Income_Cleaned Num_of_Loan_Cleaned Type_of_Loan_Cleaned Num_of_Delayed_Payment_Cleaned Changed_Credit_Limit_Cleaned Outstanding_Debt_Cleaned Credit_History_Age_in_Months Amount_invested_monthly_cleaned Monthly_Balance_Cleaned
0 0x1602 CUS_0xd40 January Aaron Maashoh Scientist 1824.843333 3.0 4.0 3.0 3 ... 821-00-0265 19114.12 4.0 Auto Loan, Credit-Builder Loan, Home Equity Lo... 7.0 11.27 809.98 265.0 80.415 312.494
1 0x1603 CUS_0xd40 February Aaron Maashoh Scientist 1824.843333 3.0 4.0 3.0 -1 ... 821-00-0265 19114.12 4.0 Auto Loan, Credit-Builder Loan, Home Equity Lo... 7.0 11.27 809.98 265.0 118.280 284.629
2 0x1604 CUS_0xd40 March Aaron Maashoh Scientist 1824.843333 3.0 4.0 3.0 3 ... 821-00-0265 19114.12 4.0 Auto Loan, Credit-Builder Loan, Home Equity Lo... 7.0 11.27 809.98 267.0 81.700 331.210
3 0x1605 CUS_0xd40 April Aaron Maashoh Scientist 1824.843333 3.0 4.0 3.0 5 ... 821-00-0265 19114.12 4.0 Auto Loan, Credit-Builder Loan, Home Equity Lo... 4.0 6.27 809.98 268.0 199.458 223.451
4 0x1606 CUS_0xd40 May Aaron Maashoh Scientist 1824.843333 3.0 4.0 3.0 6 ... 821-00-0265 19114.12 4.0 Auto Loan, Credit-Builder Loan, Home Equity Lo... 4.0 11.27 809.98 269.0 41.420 341.489
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 0x25fe5 CUS_0x8600 December Sarah McBridec Architect 1929.906667 10.0 8.0 29.0 33 ... 031-35-0942 20002.88 5.0 Auto Loan, Mortgage Loan, Personal Loan, Stude... 25.0 18.31 3571.70 78.0 146.486 275.540
149996 0x25fee CUS_0x942c September Nicks Mechanic 3359.415833 4.0 6.0 7.0 20 ... 078-73-5990 39628.99 2.0 Auto Loan, Student Loan 6.0 11.50 502.38 383.0 181.443 409.395
149997 0x25fef CUS_0x942c October Nicks Mechanic 3359.415833 4.0 6.0 7.0 23 ... 078-73-5990 39628.99 2.0 Auto Loan, Student Loan 5.0 13.50 502.38 384.0 10000.000 349.726
149998 0x25ff0 CUS_0x942c November Nicks Mechanic 3359.415833 4.0 6.0 7.0 21 ... 078-73-5990 39628.99 2.0 Auto Loan, Student Loan 6.0 11.50 502.38 385.0 97.599 463.239
149999 0x25ff1 CUS_0x942c December Nicks Mechanic 3359.415833 4.0 6.0 7.0 22 ... 078-73-5990 39628.99 2.0 Auto Loan, Student Loan 5.0 11.50 502.38 386.0 220.458 360.380

150000 rows × 29 columns

Exploratory Data Analysis After Cleaning¶

In [102]:
plt.figure(figsize=(20, 40))

numeric = df_group.select_dtypes(include=['number']).columns # Select numerical columns


for i, col in enumerate(numeric):
    plt.subplot(9, 3, i + 1)
    sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
    plt.title(f'Boxplot of {col} by Credit Score', fontsize=13, pad=10)
    plt.xlabel('Credit Score', fontsize=13)
    plt.ylabel(col, fontsize=13)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    min_val = df_group[col].min()
    max_val = df_group[col].max()
    margin = (max_val - min_val) * 0.1
    plt.ylim(min_val - margin, max_val + margin)

plt.subplots_adjust(wspace=0.4, hspace=0.7)
plt.tight_layout()
plt.show()
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
<ipython-input-102-a2600b2eb564>:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Credit_Score', y=col, data=df_group, palette='Set1')
No description has been provided for this image
In [103]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

numeric = df_group.select_dtypes(exclude=object).columns

fig, ax = plt.subplots(figsize=(20, 35))
fig.patch.set_facecolor('white')

mpl.rcParams['font.family'] = 'DejaVu Sans'
mpl.rcParams['font.size'] = 12
colors = sns.color_palette("tab10", n_colors=len(numeric)).as_hex()

for i, col in enumerate(numeric):
    plt.subplot(9, 3, i + 1)
    sns.kdeplot(x=df_group[col], color=colors[i], fill=True)
    plt.title(f'Distribution of {col}', fontsize=10)
    plt.grid(visible=False)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [104]:
import numpy as np

plt.figure(figsize=(10, len(numeric) * 4))
for i, col in enumerate(numeric):
    try:
        if df_group[col].nunique() > 1:  # Ensure column has sufficient variance
            plt.subplot(len(numeric), 1, i + 1)
            sns.histplot(
                data=df_group,
                x=col,
                hue='Credit_Score',
                kde=True,
                bins=30,
                palette='Set2'
            )
            plt.title(f'Distribution of {col} by Credit Score', fontsize=13, pad=10)
            plt.xlabel(col, fontsize=13)
            plt.ylabel('Frequency', fontsize=13)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
        else:
            print(f"Skipping column '{col}' due to insufficient variance.")
    except np.linalg.LinAlgError:
        print(f"KDE failed for column '{col}', disabling KDE.")
        plt.subplot(len(numeric), 1, i + 1)
        sns.histplot(
            data=df_group,
            x=col,
            hue='Credit_Score',
            kde=False,  # Disable KDE if it fails
            bins=30,
            palette='Set2'
        )
        plt.title(f'Distribution of {col} by Credit Score (KDE Disabled)', fontsize=13, pad=10)
        plt.xlabel(col, fontsize=13)
        plt.ylabel('Frequency', fontsize=13)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.4, hspace=0.7)
plt.tight_layout()
plt.show()
KDE failed for column 'is_train', disabling KDE.
No description has been provided for this image
In [105]:
plt.figure(figsize=(17,10))
cor = df_group[numeric].corr(method="kendall")
sns.heatmap(cor, annot=True)
plt.title('"Correlation of Numeric Features"')
plt.show()
No description has been provided for this image

Feature Selection¶

In [106]:
object_columns = df_group.select_dtypes(include=['object']).columns
object_columns = object_columns[object_columns != 'test_yes']
print(object_columns)
Index(['ID', 'Customer_ID', 'Month', 'Name', 'Occupation', 'Credit_Mix',
       'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score',
       'SSN_Cleaned', 'Type_of_Loan_Cleaned'],
      dtype='object')
In [112]:
for col in object_columns:
    unique_values = df_group[col].nunique()
    print(f"Different and unique columns {col}: ({unique_values})")
Different and unique columns ID: (150000)
Different and unique columns Customer_ID: (12500)
Different and unique columns Month: (12)
Different and unique columns Name: (10139)
Different and unique columns Occupation: (15)
Different and unique columns Credit_Mix: (3)
Different and unique columns Payment_of_Min_Amount: (2)
Different and unique columns Payment_Behaviour: (6)
Different and unique columns Credit_Score: (3)
Different and unique columns SSN_Cleaned: (12500)
Different and unique columns Type_of_Loan_Cleaned: (508)
In [113]:
month_to_number = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5,
    'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10,
    'November': 11, 'December': 12
}

df_group['Month_Number'] = df_group['Month'].map(month_to_number)
In [114]:
Credit_Mix_to_Score = {
    'Good': 2,
    'Standard': 1,
    'Bad': 0
}

df_group['Credit_Mix_Score'] = df_group['Credit_Mix'].map(Credit_Mix_to_Score)
In [115]:
Payment_of_Min_Amount_to_bolean = {
    'Yes': 1,
    'No': 0
}

df_group['Payment_of_Min_Amount_Yes'] = df_group['Payment_of_Min_Amount'].map(Payment_of_Min_Amount_to_bolean)
In [116]:
Payment_Behaviour_to_Score = {
    'Low_spent_Small_value_payments': 1,
    'Low_spent_Medium_value_payments': 2,
    'Low_spent_Large_value_payments': 3,
    'High_spent_Small_value_payments': 4,
    'High_spent_Medium_value_payments': 5,
    'High_spent_Large_value_payments': 6
}

df_group['Payment_Behaviour_Score'] = df_group['Payment_Behaviour'].map(Payment_Behaviour_to_Score)
In [117]:
df_group['Credit_Score'].unique()
Out[117]:
array(['Good', 'Standard', 'Poor', nan], dtype=object)
In [118]:
Credit_Score_to_Number = {
    'Good': 2,
    'Standard': 1,
    'Poor': 0
}

df_group['Credit_Score'] = df_group['Credit_Score'].map(Credit_Score_to_Number)
In [119]:
plt.figure(figsize=(17,10))
cor = df_group[numeric].corr(method="kendall")
sns.heatmap(cor, annot=True)
plt.title('Correlation of Numeric Features After Feature Engineering')
plt.show()
No description has been provided for this image
In [120]:
loan = df_group
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.strip()
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.replace(' ', '_', regex=True)
loan['Type_of_Loan_Cleaned'] = loan['Type_of_Loan_Cleaned'].str.replace('-', '_', regex=True)

loan = loan['Type_of_Loan_Cleaned'].str.get_dummies(sep=',_')
df_group = pd.concat([df_group, loan], axis=1)
In [121]:
df_group = pd.get_dummies(df_group, columns=['Occupation'])
In [122]:
df_group = df_group.select_dtypes(exclude=['object'])
df_group = df_group.drop(columns=['Month_Number'])
df_group
Out[122]:
Monthly_Inhand_Salary Num_Bank_Accounts Num_Credit_Card Interest_Rate Delay_from_due_date Num_Credit_Inquiries Credit_Utilization_Ratio Total_EMI_per_month Credit_Score is_train ... Occupation_Entrepreneur Occupation_Journalist Occupation_Lawyer Occupation_Manager Occupation_Mechanic Occupation_Media_Manager Occupation_Musician Occupation_Scientist Occupation_Teacher Occupation_Writer
0 1824.843333 3.0 4.0 3.0 3 4.0 26.822620 49.574949 2.0 1 ... False False False False False False False True False False
1 1824.843333 3.0 4.0 3.0 -1 4.0 31.944960 49.574949 2.0 1 ... False False False False False False False True False False
2 1824.843333 3.0 4.0 3.0 3 4.0 28.609352 49.574949 2.0 1 ... False False False False False False False True False False
3 1824.843333 3.0 4.0 3.0 5 4.0 31.377862 49.574949 2.0 1 ... False False False False False False False True False False
4 1824.843333 3.0 4.0 3.0 6 4.0 24.797347 49.574949 2.0 1 ... False False False False False False False True False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149995 1929.906667 10.0 8.0 29.0 33 12.0 34.780553 60.964772 NaN 0 ... False False False False False False False False False False
149996 3359.415833 4.0 6.0 7.0 20 7.0 27.758522 35.104023 NaN 0 ... False False False False True False False False False False
149997 3359.415833 4.0 6.0 7.0 23 7.0 36.858542 35.104023 NaN 0 ... False False False False True False False False False False
149998 3359.415833 4.0 6.0 7.0 21 7.0 39.139840 35.104023 NaN 0 ... False False False False True False False False False False
149999 3359.415833 4.0 6.0 7.0 22 7.0 34.108530 35.104023 NaN 0 ... False False False False True False False False False False

150000 rows × 47 columns

In [123]:
data_train = df_group[df_group['is_train'] == 0].drop(columns=['is_train'])
data_test = df_group[df_group['is_train'] == 1].drop(columns=['is_train'])
In [124]:
Credit_Score = data_train.pop('Credit_Score')
data_train['Credit_Score'] = Credit_Score

data_test = data_test.drop(columns=['Credit_Score'])
In [125]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   object 
 14  Delay_from_due_date       100000 non-null  int64  
 15  Num_of_Delayed_Payment    92998 non-null   object 
 16  Changed_Credit_Limit      100000 non-null  object 
 17  Num_Credit_Inquiries      98035 non-null   float64
 18  Credit_Mix                100000 non-null  object 
 19  Outstanding_Debt          100000 non-null  object 
 20  Credit_Utilization_Ratio  100000 non-null  float64
 21  Credit_History_Age        90970 non-null   object 
 22  Payment_of_Min_Amount     100000 non-null  object 
 23  Total_EMI_per_month       100000 non-null  float64
 24  Amount_invested_monthly   95521 non-null   object 
 25  Payment_Behaviour         100000 non-null  object 
 26  Monthly_Balance           98800 non-null   object 
 27  Credit_Score              100000 non-null  object 
 28  is_train                  100000 non-null  int64  
dtypes: float64(4), int64(5), object(20)
memory usage: 22.1+ MB
In [126]:
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        50000 non-null  object 
 1   Customer_ID               50000 non-null  object 
 2   Month                     50000 non-null  object 
 3   Name                      44985 non-null  object 
 4   Age                       50000 non-null  object 
 5   SSN                       50000 non-null  object 
 6   Occupation                50000 non-null  object 
 7   Annual_Income             50000 non-null  object 
 8   Monthly_Inhand_Salary     42502 non-null  float64
 9   Num_Bank_Accounts         50000 non-null  int64  
 10  Num_Credit_Card           50000 non-null  int64  
 11  Interest_Rate             50000 non-null  int64  
 12  Num_of_Loan               50000 non-null  object 
 13  Type_of_Loan              44296 non-null  object 
 14  Delay_from_due_date       50000 non-null  int64  
 15  Num_of_Delayed_Payment    46502 non-null  object 
 16  Changed_Credit_Limit      50000 non-null  object 
 17  Num_Credit_Inquiries      48965 non-null  float64
 18  Credit_Mix                50000 non-null  object 
 19  Outstanding_Debt          50000 non-null  object 
 20  Credit_Utilization_Ratio  50000 non-null  float64
 21  Credit_History_Age        45530 non-null  object 
 22  Payment_of_Min_Amount     50000 non-null  object 
 23  Total_EMI_per_month       50000 non-null  float64
 24  Amount_invested_monthly   47729 non-null  object 
 25  Payment_Behaviour         50000 non-null  object 
 26  Monthly_Balance           49438 non-null  object 
 27  is_train                  50000 non-null  int64  
dtypes: float64(4), int64(5), object(19)
memory usage: 10.7+ MB
In [127]:
df_group.head()
Out[127]:
Monthly_Inhand_Salary Num_Bank_Accounts Num_Credit_Card Interest_Rate Delay_from_due_date Num_Credit_Inquiries Credit_Utilization_Ratio Total_EMI_per_month Credit_Score is_train ... Occupation_Entrepreneur Occupation_Journalist Occupation_Lawyer Occupation_Manager Occupation_Mechanic Occupation_Media_Manager Occupation_Musician Occupation_Scientist Occupation_Teacher Occupation_Writer
0 1824.843333 3.0 4.0 3.0 3 4.0 26.822620 49.574949 2.0 1 ... False False False False False False False True False False
1 1824.843333 3.0 4.0 3.0 -1 4.0 31.944960 49.574949 2.0 1 ... False False False False False False False True False False
2 1824.843333 3.0 4.0 3.0 3 4.0 28.609352 49.574949 2.0 1 ... False False False False False False False True False False
3 1824.843333 3.0 4.0 3.0 5 4.0 31.377862 49.574949 2.0 1 ... False False False False False False False True False False
4 1824.843333 3.0 4.0 3.0 6 4.0 24.797347 49.574949 2.0 1 ... False False False False False False False True False False

5 rows × 47 columns

In [128]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = df_group.select_dtypes(include=['object', 'category']).columns

for col in categorical_columns:
    le = LabelEncoder()
    df_group[col] = le.fit_transform(df_group[col].astype(str))

print("Converted categorical features into numeric.")
Converted categorical features into numeric.

Modeling¶

Resampling¶

In [129]:
from imblearn.over_sampling import RandomOverSampler

X = df_train.drop(columns=['Credit_Score'])
y = df_train["Credit_Score"]

X, y = RandomOverSampler().fit_resample(X, y)
In [130]:
df_resampling = pd.concat([X, y], axis=1)
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Score', data=df_resampling, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Score After Resampling')
plt.xlabel('Credit_Score After Resampling')
plt.ylabel('Count')
plt.show()
<ipython-input-130-dc734f9f83f5>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Credit_Score', data=df_resampling, palette='pastel')
No description has been provided for this image

Split¶

In [131]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train',X_train.shape)
print('y_train',y_train.shape)
print('X_test',X_test.shape)
print('y_test',y_test.shape)
X_train (127617, 28)
y_train (127617,)
X_test (31905, 28)
y_test (31905,)

XGBClassifier¶

In [132]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
In [133]:
df_group.fillna(0, inplace=True)
In [134]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']

for feature in categorical_features:
    if feature in X_train.columns and feature in X_test.columns:
        le = LabelEncoder()

        le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())

        X_train[feature] = le.transform(X_train[feature].astype(str))
        X_test[feature] = le.transform(X_test[feature].astype(str))
    else:
        print(f"Feature '{feature}' not found in both X_train and X_test")
In [135]:
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

xgb_model.fit(X_train, y_train)

imp = pd.Series(data=xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 12))
sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
plt.title("Feature Importance", fontsize=16)
plt.xlabel("Importance Score", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [07:43:17] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
<ipython-input-135-5b29f37426dc>:13: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
No description has been provided for this image
In [136]:
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, xgb_model.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.820122237893747
Precision: 0.8192873295571445
Recall: 0.820122237893747
              precision    recall  f1-score   support

           0       0.82      0.93      0.87     10710
           1       0.83      0.86      0.84     10578
           2       0.81      0.67      0.73     10617

    accuracy                           0.82     31905
   macro avg       0.82      0.82      0.82     31905
weighted avg       0.82      0.82      0.82     31905

AUC-ROC Score: 0.9312729492479699
In [137]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
                           param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters XGBClassifier:", grid_search.best_params_)
/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [08:41:17] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best Parameters XGBClassifier: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}
In [138]:
best_params = grid_search.best_params_
optimized_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample']
)

eval_set = [(X_train, y_train), (X_test, y_test)]

optimized_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=True)
results_optimized_xgb = optimized_xgb.evals_result()

epochs = len(results_optimized_xgb['validation_0']['mlogloss'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results_optimized_xgb['validation_0']['mlogloss'], label='Train')
plt.plot(x_axis, results_optimized_xgb['validation_1']['mlogloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('Training and Validation Log Loss')
plt.legend()
plt.grid()
plt.show()
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [08:48:40] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
[0]	validation_0-mlogloss:0.97079	validation_1-mlogloss:0.97546
[1]	validation_0-mlogloss:0.87821	validation_1-mlogloss:0.88724
[2]	validation_0-mlogloss:0.80828	validation_1-mlogloss:0.82175
[3]	validation_0-mlogloss:0.75284	validation_1-mlogloss:0.77045
[4]	validation_0-mlogloss:0.70945	validation_1-mlogloss:0.73094
[5]	validation_0-mlogloss:0.67420	validation_1-mlogloss:0.69953
[6]	validation_0-mlogloss:0.64561	validation_1-mlogloss:0.67437
[7]	validation_0-mlogloss:0.62000	validation_1-mlogloss:0.65264
[8]	validation_0-mlogloss:0.59681	validation_1-mlogloss:0.63383
[9]	validation_0-mlogloss:0.57665	validation_1-mlogloss:0.61815
[10]	validation_0-mlogloss:0.55835	validation_1-mlogloss:0.60361
[11]	validation_0-mlogloss:0.54008	validation_1-mlogloss:0.58900
[12]	validation_0-mlogloss:0.52510	validation_1-mlogloss:0.57715
[13]	validation_0-mlogloss:0.51202	validation_1-mlogloss:0.56692
[14]	validation_0-mlogloss:0.49934	validation_1-mlogloss:0.55724
[15]	validation_0-mlogloss:0.48750	validation_1-mlogloss:0.54842
[16]	validation_0-mlogloss:0.47507	validation_1-mlogloss:0.53940
[17]	validation_0-mlogloss:0.46492	validation_1-mlogloss:0.53249
[18]	validation_0-mlogloss:0.45438	validation_1-mlogloss:0.52438
[19]	validation_0-mlogloss:0.44542	validation_1-mlogloss:0.51793
[20]	validation_0-mlogloss:0.43685	validation_1-mlogloss:0.51154
[21]	validation_0-mlogloss:0.42769	validation_1-mlogloss:0.50463
[22]	validation_0-mlogloss:0.41912	validation_1-mlogloss:0.49758
[23]	validation_0-mlogloss:0.41174	validation_1-mlogloss:0.49241
[24]	validation_0-mlogloss:0.40445	validation_1-mlogloss:0.48685
[25]	validation_0-mlogloss:0.39623	validation_1-mlogloss:0.48070
[26]	validation_0-mlogloss:0.39003	validation_1-mlogloss:0.47576
[27]	validation_0-mlogloss:0.38500	validation_1-mlogloss:0.47176
[28]	validation_0-mlogloss:0.38015	validation_1-mlogloss:0.46804
[29]	validation_0-mlogloss:0.37395	validation_1-mlogloss:0.46347
[30]	validation_0-mlogloss:0.36709	validation_1-mlogloss:0.45814
[31]	validation_0-mlogloss:0.36088	validation_1-mlogloss:0.45349
[32]	validation_0-mlogloss:0.35700	validation_1-mlogloss:0.45057
[33]	validation_0-mlogloss:0.35281	validation_1-mlogloss:0.44739
[34]	validation_0-mlogloss:0.34783	validation_1-mlogloss:0.44391
[35]	validation_0-mlogloss:0.34205	validation_1-mlogloss:0.43932
[36]	validation_0-mlogloss:0.33694	validation_1-mlogloss:0.43576
[37]	validation_0-mlogloss:0.33233	validation_1-mlogloss:0.43261
[38]	validation_0-mlogloss:0.32824	validation_1-mlogloss:0.42983
[39]	validation_0-mlogloss:0.32345	validation_1-mlogloss:0.42682
[40]	validation_0-mlogloss:0.31951	validation_1-mlogloss:0.42355
[41]	validation_0-mlogloss:0.31562	validation_1-mlogloss:0.42101
[42]	validation_0-mlogloss:0.31141	validation_1-mlogloss:0.41789
[43]	validation_0-mlogloss:0.30756	validation_1-mlogloss:0.41520
[44]	validation_0-mlogloss:0.30377	validation_1-mlogloss:0.41232
[45]	validation_0-mlogloss:0.29928	validation_1-mlogloss:0.40887
[46]	validation_0-mlogloss:0.29614	validation_1-mlogloss:0.40702
[47]	validation_0-mlogloss:0.29114	validation_1-mlogloss:0.40337
[48]	validation_0-mlogloss:0.28680	validation_1-mlogloss:0.39978
[49]	validation_0-mlogloss:0.28319	validation_1-mlogloss:0.39753
[50]	validation_0-mlogloss:0.27714	validation_1-mlogloss:0.39267
[51]	validation_0-mlogloss:0.27181	validation_1-mlogloss:0.38874
[52]	validation_0-mlogloss:0.26847	validation_1-mlogloss:0.38657
[53]	validation_0-mlogloss:0.26530	validation_1-mlogloss:0.38434
[54]	validation_0-mlogloss:0.26186	validation_1-mlogloss:0.38180
[55]	validation_0-mlogloss:0.25868	validation_1-mlogloss:0.38006
[56]	validation_0-mlogloss:0.25597	validation_1-mlogloss:0.37822
[57]	validation_0-mlogloss:0.25279	validation_1-mlogloss:0.37646
[58]	validation_0-mlogloss:0.24998	validation_1-mlogloss:0.37440
[59]	validation_0-mlogloss:0.24674	validation_1-mlogloss:0.37217
[60]	validation_0-mlogloss:0.24340	validation_1-mlogloss:0.37004
[61]	validation_0-mlogloss:0.23985	validation_1-mlogloss:0.36786
[62]	validation_0-mlogloss:0.23592	validation_1-mlogloss:0.36534
[63]	validation_0-mlogloss:0.23195	validation_1-mlogloss:0.36259
[64]	validation_0-mlogloss:0.22845	validation_1-mlogloss:0.36053
[65]	validation_0-mlogloss:0.22536	validation_1-mlogloss:0.35843
[66]	validation_0-mlogloss:0.22218	validation_1-mlogloss:0.35683
[67]	validation_0-mlogloss:0.21900	validation_1-mlogloss:0.35462
[68]	validation_0-mlogloss:0.21575	validation_1-mlogloss:0.35273
[69]	validation_0-mlogloss:0.21299	validation_1-mlogloss:0.35084
[70]	validation_0-mlogloss:0.20951	validation_1-mlogloss:0.34865
[71]	validation_0-mlogloss:0.20664	validation_1-mlogloss:0.34659
[72]	validation_0-mlogloss:0.20359	validation_1-mlogloss:0.34475
[73]	validation_0-mlogloss:0.20142	validation_1-mlogloss:0.34325
[74]	validation_0-mlogloss:0.19907	validation_1-mlogloss:0.34166
[75]	validation_0-mlogloss:0.19664	validation_1-mlogloss:0.34010
[76]	validation_0-mlogloss:0.19361	validation_1-mlogloss:0.33818
[77]	validation_0-mlogloss:0.19128	validation_1-mlogloss:0.33634
[78]	validation_0-mlogloss:0.18866	validation_1-mlogloss:0.33474
[79]	validation_0-mlogloss:0.18685	validation_1-mlogloss:0.33360
[80]	validation_0-mlogloss:0.18461	validation_1-mlogloss:0.33239
[81]	validation_0-mlogloss:0.18244	validation_1-mlogloss:0.33113
[82]	validation_0-mlogloss:0.18038	validation_1-mlogloss:0.32992
[83]	validation_0-mlogloss:0.17756	validation_1-mlogloss:0.32806
[84]	validation_0-mlogloss:0.17537	validation_1-mlogloss:0.32689
[85]	validation_0-mlogloss:0.17322	validation_1-mlogloss:0.32594
[86]	validation_0-mlogloss:0.17051	validation_1-mlogloss:0.32438
[87]	validation_0-mlogloss:0.16720	validation_1-mlogloss:0.32254
[88]	validation_0-mlogloss:0.16500	validation_1-mlogloss:0.32124
[89]	validation_0-mlogloss:0.16290	validation_1-mlogloss:0.32031
[90]	validation_0-mlogloss:0.16044	validation_1-mlogloss:0.31882
[91]	validation_0-mlogloss:0.15836	validation_1-mlogloss:0.31762
[92]	validation_0-mlogloss:0.15641	validation_1-mlogloss:0.31644
[93]	validation_0-mlogloss:0.15438	validation_1-mlogloss:0.31503
[94]	validation_0-mlogloss:0.15279	validation_1-mlogloss:0.31429
[95]	validation_0-mlogloss:0.15086	validation_1-mlogloss:0.31297
[96]	validation_0-mlogloss:0.14914	validation_1-mlogloss:0.31206
[97]	validation_0-mlogloss:0.14759	validation_1-mlogloss:0.31110
[98]	validation_0-mlogloss:0.14576	validation_1-mlogloss:0.30998
[99]	validation_0-mlogloss:0.14350	validation_1-mlogloss:0.30863
[100]	validation_0-mlogloss:0.14180	validation_1-mlogloss:0.30766
[101]	validation_0-mlogloss:0.13957	validation_1-mlogloss:0.30629
[102]	validation_0-mlogloss:0.13779	validation_1-mlogloss:0.30554
[103]	validation_0-mlogloss:0.13619	validation_1-mlogloss:0.30477
[104]	validation_0-mlogloss:0.13426	validation_1-mlogloss:0.30354
[105]	validation_0-mlogloss:0.13306	validation_1-mlogloss:0.30296
[106]	validation_0-mlogloss:0.13139	validation_1-mlogloss:0.30212
[107]	validation_0-mlogloss:0.12989	validation_1-mlogloss:0.30151
[108]	validation_0-mlogloss:0.12812	validation_1-mlogloss:0.30063
[109]	validation_0-mlogloss:0.12649	validation_1-mlogloss:0.29984
[110]	validation_0-mlogloss:0.12508	validation_1-mlogloss:0.29911
[111]	validation_0-mlogloss:0.12324	validation_1-mlogloss:0.29799
[112]	validation_0-mlogloss:0.12197	validation_1-mlogloss:0.29728
[113]	validation_0-mlogloss:0.12030	validation_1-mlogloss:0.29640
[114]	validation_0-mlogloss:0.11892	validation_1-mlogloss:0.29571
[115]	validation_0-mlogloss:0.11681	validation_1-mlogloss:0.29453
[116]	validation_0-mlogloss:0.11585	validation_1-mlogloss:0.29404
[117]	validation_0-mlogloss:0.11434	validation_1-mlogloss:0.29308
[118]	validation_0-mlogloss:0.11305	validation_1-mlogloss:0.29229
[119]	validation_0-mlogloss:0.11171	validation_1-mlogloss:0.29146
[120]	validation_0-mlogloss:0.11063	validation_1-mlogloss:0.29114
[121]	validation_0-mlogloss:0.10906	validation_1-mlogloss:0.29021
[122]	validation_0-mlogloss:0.10765	validation_1-mlogloss:0.28949
[123]	validation_0-mlogloss:0.10672	validation_1-mlogloss:0.28906
[124]	validation_0-mlogloss:0.10530	validation_1-mlogloss:0.28821
[125]	validation_0-mlogloss:0.10427	validation_1-mlogloss:0.28755
[126]	validation_0-mlogloss:0.10327	validation_1-mlogloss:0.28687
[127]	validation_0-mlogloss:0.10225	validation_1-mlogloss:0.28639
[128]	validation_0-mlogloss:0.10118	validation_1-mlogloss:0.28594
[129]	validation_0-mlogloss:0.10000	validation_1-mlogloss:0.28526
[130]	validation_0-mlogloss:0.09897	validation_1-mlogloss:0.28481
[131]	validation_0-mlogloss:0.09791	validation_1-mlogloss:0.28409
[132]	validation_0-mlogloss:0.09667	validation_1-mlogloss:0.28360
[133]	validation_0-mlogloss:0.09532	validation_1-mlogloss:0.28283
[134]	validation_0-mlogloss:0.09409	validation_1-mlogloss:0.28228
[135]	validation_0-mlogloss:0.09288	validation_1-mlogloss:0.28156
[136]	validation_0-mlogloss:0.09151	validation_1-mlogloss:0.28087
[137]	validation_0-mlogloss:0.09033	validation_1-mlogloss:0.28018
[138]	validation_0-mlogloss:0.08961	validation_1-mlogloss:0.27995
[139]	validation_0-mlogloss:0.08834	validation_1-mlogloss:0.27933
[140]	validation_0-mlogloss:0.08725	validation_1-mlogloss:0.27897
[141]	validation_0-mlogloss:0.08628	validation_1-mlogloss:0.27866
[142]	validation_0-mlogloss:0.08537	validation_1-mlogloss:0.27822
[143]	validation_0-mlogloss:0.08439	validation_1-mlogloss:0.27770
[144]	validation_0-mlogloss:0.08361	validation_1-mlogloss:0.27737
[145]	validation_0-mlogloss:0.08232	validation_1-mlogloss:0.27670
[146]	validation_0-mlogloss:0.08134	validation_1-mlogloss:0.27616
[147]	validation_0-mlogloss:0.08038	validation_1-mlogloss:0.27574
[148]	validation_0-mlogloss:0.07939	validation_1-mlogloss:0.27530
[149]	validation_0-mlogloss:0.07870	validation_1-mlogloss:0.27496
[150]	validation_0-mlogloss:0.07797	validation_1-mlogloss:0.27459
[151]	validation_0-mlogloss:0.07692	validation_1-mlogloss:0.27394
[152]	validation_0-mlogloss:0.07626	validation_1-mlogloss:0.27359
[153]	validation_0-mlogloss:0.07525	validation_1-mlogloss:0.27310
[154]	validation_0-mlogloss:0.07438	validation_1-mlogloss:0.27277
[155]	validation_0-mlogloss:0.07351	validation_1-mlogloss:0.27242
[156]	validation_0-mlogloss:0.07256	validation_1-mlogloss:0.27217
[157]	validation_0-mlogloss:0.07178	validation_1-mlogloss:0.27208
[158]	validation_0-mlogloss:0.07114	validation_1-mlogloss:0.27180
[159]	validation_0-mlogloss:0.07040	validation_1-mlogloss:0.27144
[160]	validation_0-mlogloss:0.06968	validation_1-mlogloss:0.27107
[161]	validation_0-mlogloss:0.06887	validation_1-mlogloss:0.27071
[162]	validation_0-mlogloss:0.06806	validation_1-mlogloss:0.27028
[163]	validation_0-mlogloss:0.06735	validation_1-mlogloss:0.27003
[164]	validation_0-mlogloss:0.06671	validation_1-mlogloss:0.26983
[165]	validation_0-mlogloss:0.06591	validation_1-mlogloss:0.26958
[166]	validation_0-mlogloss:0.06515	validation_1-mlogloss:0.26920
[167]	validation_0-mlogloss:0.06438	validation_1-mlogloss:0.26877
[168]	validation_0-mlogloss:0.06357	validation_1-mlogloss:0.26827
[169]	validation_0-mlogloss:0.06296	validation_1-mlogloss:0.26800
[170]	validation_0-mlogloss:0.06228	validation_1-mlogloss:0.26777
[171]	validation_0-mlogloss:0.06144	validation_1-mlogloss:0.26739
[172]	validation_0-mlogloss:0.06064	validation_1-mlogloss:0.26718
[173]	validation_0-mlogloss:0.05989	validation_1-mlogloss:0.26689
[174]	validation_0-mlogloss:0.05919	validation_1-mlogloss:0.26666
[175]	validation_0-mlogloss:0.05878	validation_1-mlogloss:0.26650
[176]	validation_0-mlogloss:0.05841	validation_1-mlogloss:0.26634
[177]	validation_0-mlogloss:0.05772	validation_1-mlogloss:0.26617
[178]	validation_0-mlogloss:0.05711	validation_1-mlogloss:0.26595
[179]	validation_0-mlogloss:0.05650	validation_1-mlogloss:0.26573
[180]	validation_0-mlogloss:0.05573	validation_1-mlogloss:0.26535
[181]	validation_0-mlogloss:0.05518	validation_1-mlogloss:0.26528
[182]	validation_0-mlogloss:0.05445	validation_1-mlogloss:0.26492
[183]	validation_0-mlogloss:0.05402	validation_1-mlogloss:0.26466
[184]	validation_0-mlogloss:0.05344	validation_1-mlogloss:0.26446
[185]	validation_0-mlogloss:0.05300	validation_1-mlogloss:0.26431
[186]	validation_0-mlogloss:0.05247	validation_1-mlogloss:0.26408
[187]	validation_0-mlogloss:0.05187	validation_1-mlogloss:0.26385
[188]	validation_0-mlogloss:0.05123	validation_1-mlogloss:0.26374
[189]	validation_0-mlogloss:0.05050	validation_1-mlogloss:0.26346
[190]	validation_0-mlogloss:0.05006	validation_1-mlogloss:0.26333
[191]	validation_0-mlogloss:0.04953	validation_1-mlogloss:0.26312
[192]	validation_0-mlogloss:0.04883	validation_1-mlogloss:0.26300
[193]	validation_0-mlogloss:0.04837	validation_1-mlogloss:0.26288
[194]	validation_0-mlogloss:0.04782	validation_1-mlogloss:0.26273
[195]	validation_0-mlogloss:0.04726	validation_1-mlogloss:0.26244
[196]	validation_0-mlogloss:0.04678	validation_1-mlogloss:0.26216
[197]	validation_0-mlogloss:0.04632	validation_1-mlogloss:0.26201
[198]	validation_0-mlogloss:0.04577	validation_1-mlogloss:0.26198
[199]	validation_0-mlogloss:0.04538	validation_1-mlogloss:0.26183
No description has been provided for this image
In [142]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']

for feature in categorical_features:
    if feature in X_train.columns and feature in X_test.columns:
        le = LabelEncoder()
        # Fit on the combined unique values from both train and test
        le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
        X_train[feature] = le.transform(X_train[feature].astype(str))
        X_test[feature] = le.transform(X_test[feature].astype(str))
    else:
        print(f"Feature '{feature}' not found in both X_train and X_test")
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
<ipython-input-142-7d25b58549fd>:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-142-7d25b58549fd>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = le.transform(X_test[feature].astype(str))
In [152]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']

# Ensure X_test has the same columns as X_train used during training
X_test = X_test[[col for col in X_train.columns if col in X_test.columns]]

for feature in categorical_features:
    if feature in X_train.columns and feature in X_test.columns:
        le = LabelEncoder()
        # Fit on the combined unique values from both train and test
        le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
        X_train[feature] = le.transform(X_train[feature].astype(str))
        X_test[feature] = le.transform(X_test[feature].astype(str))
    else:
        print(f"Feature '{feature}' not found in both X_train and X_test")
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-152-65e1e9859ddb>:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
In [153]:
y_pred = optimized_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, optimized_xgb.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.6871114299378288
Precision: 0.6917502759377729
Recall: 0.6871114299378288
              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1465
           1       0.64      0.65      0.65      2488
           2       0.74      0.69      0.72      4411

    accuracy                           0.69      8364
   macro avg       0.67      0.69      0.68      8364
weighted avg       0.69      0.69      0.69      8364

AUC-ROC Score: 0.8248599898132657
In [157]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

X_Kfold = X_train
y_Kfold = y_train

optimized_xgb_kfold = optimized_xgb

# K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Manual K-Fold
f1_scores, precision_scores, recall_scores = [], [], []

# --- Fit LabelEncoders outside the loop ---
categorical_features = X.select_dtypes(include=['object']).columns
label_encoders = {}  # Store LabelEncoders for each feature

for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    # Fit on all data to avoid unseen labels
    label_encoders[feature].fit(X[feature].astype(str).unique())

# --- Now iterate through folds ---
for train_index, test_index in kf.split(X_Kfold, y_Kfold):
    # Select data using .iloc
    X_train_fold = X.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    y_train_fold = y.iloc[train_index]
    y_test_fold = y.iloc[test_index]

    # --- Apply Label Encoding using pre-fitted encoders ---
    for feature in categorical_features:
        X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
        X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))

    # --- Convert all data to float after encoding ---
    X_train_fold = X_train_fold.astype(float)
    X_test_fold = X_test_fold.astype(float)

    # Encode the target variable within the loop
    le = LabelEncoder()
    y_train_fold = le.fit_transform(y_train_fold)
    y_test_fold = le.transform(y_test_fold)

    optimized_xgb_kfold.fit(X_train_fold, y_train_fold)  # Use the converted data
    y_pred = optimized_xgb_kfold.predict(X_test_fold)  # Use the converted data

    f1_scores.append(f1_score(y_test_fold, y_pred, average='weighted'))
    precision_scores.append(precision_score(y_test_fold, y_pred, average='weighted'))
    recall_scores.append(recall_score(y_test_fold, y_pred, average='weighted'))

print("Mean F1-Score:", np.mean(f1_scores))
print("Mean Precision:", np.mean(precision_scores))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:03] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:25] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:01:48] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:02:05] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold[feature] = label_encoders[feature].transform(X_train_fold[feature].astype(str))
<ipython-input-157-30743ffd0d62>:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fold[feature] = label_encoders[feature].transform(X_test_fold[feature].astype(str))
/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [09:02:22] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Mean F1-Score: 0.795492429540106
Mean Precision: 0.795518354123495
In [159]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Assuming categorical_features contains your list of object type columns
categorical_features = ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance']

# Ensure X_test has the same columns as X_train used during training
# This line is crucial to ensure compatibility
X_test = X_test[[col for col in X_train.columns if col in X_test.columns]]

for feature in categorical_features:
    if feature in X_train.columns and feature in X_test.columns:
        le = LabelEncoder()
        # Fit on the combined unique values from both train and test
        le.fit(pd.concat([X_train[feature], X_test[feature]]).astype(str).unique())
        X_train[feature] = le.transform(X_train[feature].astype(str))
        X_test[feature] = le.transform(X_test[feature].astype(str))
    else:
        print(f"Feature '{feature}' not found in both X_train and X_test")

# --- Convert all data to float after encoding ---
# This line ensures XGBoost compatibility
X_test = X_test.astype(float)
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))
<ipython-input-159-379acd4c14c7>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.transform(X_train[feature].astype(str))

RFC try¶

In [160]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Plot features importances
imp = pd.Series(data=rf_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
<ipython-input-160-7a7af09c5d56>:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
No description has been provided for this image
In [161]:
from sklearn.ensemble import RandomForestClassifier

train_scores = []
val_scores = []

for n_estimators in range(10, 201, 10):
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf_model.fit(X_train, y_train)


    train_pred = rf_model.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    train_scores.append(train_acc)

    val_pred = rf_model.predict(X_test)
    val_acc = accuracy_score(y_test, val_pred)
    val_scores.append(val_acc)

plt.figure(figsize=(10, 6))
plt.plot(range(10, 201, 10), train_scores, label='Train Accuracy')
plt.plot(range(10, 201, 10), val_scores, label='Validation Accuracy')
plt.xlabel('Number of Trees (n_estimators)')
plt.ylabel('Accuracy')
plt.title('Random Forest Training vs Validation Accuracy')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image
In [162]:
final_rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_rf_model.fit(X_train, y_train)
y_pred = final_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, final_rf_model.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.7882229861007324
Precision: 0.7883267561389053
Recall: 0.7882229861007324
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      1161
           1       0.78      0.81      0.80      1908
           2       0.81      0.80      0.81      3622

    accuracy                           0.79      6691
   macro avg       0.77      0.78      0.77      6691
weighted avg       0.79      0.79      0.79      6691

AUC-ROC Score: 0.8948954966339161

LGBMClassifier¶

In [163]:
import lightgbm as lgb

lgbm = lgb.LGBMClassifier(objective='multiclass', random_state=42)
lgbm.fit(X_train, y_train)

# Plot features importances
imp = pd.Series(data=rf_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
/usr/local/lib/python3.10/dist-packages/dask/dataframe/__init__.py:42: FutureWarning: 
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  warnings.warn(msg, FutureWarning)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5084
[LightGBM] [Info] Number of data points in the train set: 26761, number of used features: 27
[LightGBM] [Info] Start training from score -1.714587
[LightGBM] [Info] Start training from score -1.222110
[LightGBM] [Info] Start training from score -0.643683
<ipython-input-163-ba426aead1c0>:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')
No description has been provided for this image
In [164]:
lgbm.fit(X_train, y_train, eval_metric='multi_logloss', eval_set=eval_set)
results_lgbm = lgbm.evals_result_

epochs = len(results_lgbm['valid_0']['multi_logloss'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results_lgbm['valid_0']['multi_logloss'], label='Train')
plt.plot(x_axis, results_lgbm['valid_1']['multi_logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('Training and Validation Log Loss')
plt.legend()
plt.grid()
plt.show()
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002209 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5084
[LightGBM] [Info] Number of data points in the train set: 26761, number of used features: 27
[LightGBM] [Info] Start training from score -1.714587
[LightGBM] [Info] Start training from score -1.222110
[LightGBM] [Info] Start training from score -0.643683
No description has been provided for this image
In [165]:
y_pred = lgbm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, lgbm.predict_proba(X_test), multi_class='ovr', average='weighted'))
Accuracy: 0.7587804513525631
Precision: 0.7603072631063886
Recall: 0.7587804513525631
              precision    recall  f1-score   support

           0       0.68      0.73      0.70      1161
           1       0.75      0.75      0.75      1908
           2       0.79      0.77      0.78      3622

    accuracy                           0.76      6691
   macro avg       0.74      0.75      0.74      6691
weighted avg       0.76      0.76      0.76      6691

AUC-ROC Score: 0.8787710274426586
In [170]:
# @title
# prompt: target variable is credit_score make a deep learning model using pytoarch add more layers cause the model is underfitting the loss is nan i need good accuracy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Define the neural network architecture with more layers
class CreditScoreClassifier(nn.Module):
    def __init__(self, input_size):
        super(CreditScoreClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)  # Increased neurons
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3) # Add dropout for regularization
        self.fc2 = nn.Linear(256, 128)  # Added another layer
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3) # Add dropout
        self.fc3 = nn.Linear(128, 64) # Added another layer
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(64, 3) # Output layer (3 classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        return x

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # Use long for class labels
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # Adjust batch size

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
model = CreditScoreClassifier(input_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001) # Use Adam optimizer

# Training loop with more epochs
num_epochs = 50  # Increase the number of epochs
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')
Epoch [1/100], Loss: nan
Epoch [2/100], Loss: nan
Epoch [3/100], Loss: nan
Epoch [4/100], Loss: nan
Epoch [5/100], Loss: nan
Epoch [6/100], Loss: nan
Epoch [7/100], Loss: nan
Epoch [8/100], Loss: nan
Epoch [9/100], Loss: nan
Epoch [10/100], Loss: nan
Epoch [11/100], Loss: nan
Epoch [12/100], Loss: nan
Epoch [13/100], Loss: nan
Epoch [14/100], Loss: nan
Epoch [15/100], Loss: nan
Epoch [16/100], Loss: nan
Epoch [17/100], Loss: nan
Epoch [18/100], Loss: nan
Epoch [19/100], Loss: nan
Epoch [20/100], Loss: nan
Epoch [21/100], Loss: nan
Epoch [22/100], Loss: nan
Epoch [23/100], Loss: nan
Epoch [24/100], Loss: nan
Epoch [25/100], Loss: nan
Epoch [26/100], Loss: nan
Epoch [27/100], Loss: nan
Epoch [28/100], Loss: nan
Epoch [29/100], Loss: nan
Epoch [30/100], Loss: nan
Epoch [31/100], Loss: nan
Epoch [32/100], Loss: nan
Epoch [33/100], Loss: nan
Epoch [34/100], Loss: nan
Epoch [35/100], Loss: nan
Epoch [36/100], Loss: nan
Epoch [37/100], Loss: nan
Epoch [38/100], Loss: nan
Epoch [39/100], Loss: nan
Epoch [40/100], Loss: nan
Epoch [41/100], Loss: nan
Epoch [42/100], Loss: nan
Epoch [43/100], Loss: nan
Epoch [44/100], Loss: nan
Epoch [45/100], Loss: nan
Epoch [46/100], Loss: nan
Epoch [47/100], Loss: nan
Epoch [48/100], Loss: nan
Epoch [49/100], Loss: nan
Epoch [50/100], Loss: nan
Epoch [51/100], Loss: nan
Epoch [52/100], Loss: nan
Epoch [53/100], Loss: nan
Epoch [54/100], Loss: nan
Epoch [55/100], Loss: nan
Epoch [56/100], Loss: nan
Epoch [57/100], Loss: nan
Epoch [58/100], Loss: nan
Epoch [59/100], Loss: nan
Epoch [60/100], Loss: nan
Epoch [61/100], Loss: nan
Epoch [62/100], Loss: nan
Epoch [63/100], Loss: nan
Epoch [64/100], Loss: nan
Epoch [65/100], Loss: nan
Epoch [66/100], Loss: nan
Epoch [67/100], Loss: nan
Epoch [68/100], Loss: nan
Epoch [69/100], Loss: nan
Epoch [70/100], Loss: nan
Epoch [71/100], Loss: nan
Epoch [72/100], Loss: nan
Epoch [73/100], Loss: nan
Epoch [74/100], Loss: nan
Epoch [75/100], Loss: nan
Epoch [76/100], Loss: nan
Epoch [77/100], Loss: nan
Epoch [78/100], Loss: nan
Epoch [79/100], Loss: nan
Epoch [80/100], Loss: nan
Epoch [81/100], Loss: nan
Epoch [82/100], Loss: nan
Epoch [83/100], Loss: nan
Epoch [84/100], Loss: nan
Epoch [85/100], Loss: nan
Epoch [86/100], Loss: nan
Epoch [87/100], Loss: nan
Epoch [88/100], Loss: nan
Epoch [89/100], Loss: nan
Epoch [90/100], Loss: nan
Epoch [91/100], Loss: nan
Epoch [92/100], Loss: nan
Epoch [93/100], Loss: nan
Epoch [94/100], Loss: nan
Epoch [95/100], Loss: nan
Epoch [96/100], Loss: nan
Epoch [97/100], Loss: nan
Epoch [98/100], Loss: nan
Epoch [99/100], Loss: nan
Epoch [100/100], Loss: nan
Accuracy of the model on the test images: 17.35166641757585%
In [174]:
# @title
# Get the feature names used during training
training_features = optimized_xgb.get_booster().feature_names

# Ensure training_features are present in data_test
# Instead of filtering, reorder data_test_subset columns to match training_features
data_test_subset = data_test[[f for f in training_features if f in data_test.columns]]

# Now, make the prediction
y_pred_final = optimized_xgb.predict(data_test_subset)
y_pred_final
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-174-128d13539418> in <cell line: 9>()
      7 
      8 # Now, make the prediction
----> 9 y_pred_final = optimized_xgb.predict(data_test_subset)
     10 y_pred_final

/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py in predict(self, X, output_margin, validate_features, base_margin, iteration_range)
   1563     ) -> ArrayLike:
   1564         with config_context(verbosity=self.verbosity):
-> 1565             class_probs = super().predict(
   1566                 X=X,
   1567                 output_margin=output_margin,

/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py in predict(self, X, output_margin, validate_features, base_margin, iteration_range)
   1184             if self._can_use_inplace_predict():
   1185                 try:
-> 1186                     predts = self.get_booster().inplace_predict(
   1187                         data=X,
   1188                         iteration_range=iteration_range,

/usr/local/lib/python3.10/dist-packages/xgboost/core.py in inplace_predict(self, data, iteration_range, predict_type, missing, validate_features, base_margin, strict_shape)
   2512             data, fns, _ = _transform_pandas_df(data, enable_categorical)
   2513             if validate_features:
-> 2514                 self._validate_features(fns)
   2515         if _is_list(data) or _is_tuple(data):
   2516             data = np.array(data)

/usr/local/lib/python3.10/dist-packages/xgboost/core.py in _validate_features(self, feature_names)
   3077                 )
   3078 
-> 3079             raise ValueError(msg.format(self.feature_names, feature_names))
   3080 
   3081     def get_split_value_histogram(

ValueError: feature_names mismatch: ['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'is_train'] ['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries', 'Credit_Utilization_Ratio', 'Total_EMI_per_month']
expected ID, Type_of_Loan, Occupation, Annual_Income, Age, Payment_Behaviour, is_train, Name, Changed_Credit_Limit, Num_of_Loan, Num_of_Delayed_Payment, Credit_History_Age, Monthly_Balance, Outstanding_Debt, Amount_invested_monthly, SSN, Credit_Mix, Customer_ID, Payment_of_Min_Amount, Month in input data
In [ ]:
# @title
Credit_Score_to_Label = {
    2:'Good',
    1:'Standard',
    0:'Poor'
}

y_pred_credit_score = np.vectorize(Credit_Score_to_Label.get)(y_pred_final)
y_pred_credit_score
In [ ]:
# @title
submission = pd.DataFrame({
    'ID': df_test['ID'],
    'Credit_Score': y_pred_credit_score
})
submission
In [ ]:
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/IS675_lab02.ipynb"
In [ ]:
# @title
plt.figure(figsize=(8, 6))
sns.countplot(x='Credit_Score', data=submission, palette='pastel')
plt.title('Distribution of Target Variable: Credit_Score on Test')
plt.xlabel('Credit_Score on Test')
plt.ylabel('Count')
plt.show()